diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index 6e87c7d65298b1..e03ffe6f3b0867 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -5145,16 +5145,12 @@ void CodeGen::genArm64EmitterUnitTests() return; } - if (!compiler->opts.altJit) - { - // No point doing this in a "real" JIT. - return; - } - +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS // Mark the "fake" instructions in the output. printf("*************** In genArm64EmitterUnitTests()\n"); emitter* theEmitter = GetEmitter(); +#endif // ALL_ARM64_EMITTER_UNIT_TESTS #ifdef ALL_ARM64_EMITTER_UNIT_TESTS // We use this: @@ -7893,6 +7889,40 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_fcvtn2, EA_8BYTE, REG_V0, REG_V1); #endif +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // sadalp vector + theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + + // saddlp vector + theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + + // uadalp vector + theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + + // uaddlp vector + theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + #ifdef ALL_ARM64_EMITTER_UNIT_TESTS // // R_R floating point round to int, one dest, one source @@ -8645,6 +8675,272 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R_R(INS_zip2, EA_16BYTE, REG_V18, REG_V19, REG_V20, INS_OPTS_2D); #endif // ALL_ARM64_EMITTER_UNIT_TESTS +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // addhn vector + theEmitter->emitIns_R_R_R(INS_addhn, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_addhn, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_addhn, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // addhn2 vector + theEmitter->emitIns_R_R_R(INS_addhn2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_addhn2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_addhn2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // raddhn vector + theEmitter->emitIns_R_R_R(INS_raddhn, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_raddhn, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_raddhn, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // raddhn2 vector + theEmitter->emitIns_R_R_R(INS_raddhn2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_raddhn2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_raddhn2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // rsubhn vector + theEmitter->emitIns_R_R_R(INS_rsubhn, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_rsubhn, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_rsubhn, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // rsubhn2 vector + theEmitter->emitIns_R_R_R(INS_rsubhn2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_rsubhn2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_rsubhn2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // sabal vector + theEmitter->emitIns_R_R_R(INS_sabal, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_sabal, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_sabal, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // sabal2 vector + theEmitter->emitIns_R_R_R(INS_sabal2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_sabal2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_sabal2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // sabdl vector + theEmitter->emitIns_R_R_R(INS_sabdl, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_sabdl, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_sabdl, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // sabdl2 vector + theEmitter->emitIns_R_R_R(INS_sabdl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_sabdl2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_sabdl2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // saddl vector + theEmitter->emitIns_R_R_R(INS_saddl, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_saddl, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_saddl, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // saddl2 vector + theEmitter->emitIns_R_R_R(INS_saddl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_saddl2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_saddl2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // saddw vector + theEmitter->emitIns_R_R_R(INS_saddw, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_saddw, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_saddw, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // saddw2 vector + theEmitter->emitIns_R_R_R(INS_saddw2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_saddw2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_saddw2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // shadd vector + theEmitter->emitIns_R_R_R(INS_shadd, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_shadd, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_shadd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_shadd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_shadd, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_shadd, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // shsub vector + theEmitter->emitIns_R_R_R(INS_shsub, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_shsub, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_shsub, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_shsub, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_shsub, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_shsub, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // sqadd scalar + theEmitter->emitIns_R_R_R(INS_sqadd, EA_1BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_sqadd, EA_2BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_sqadd, EA_4BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_sqadd, EA_8BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_NONE); + + // sqadd vector + theEmitter->emitIns_R_R_R(INS_sqadd, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_sqadd, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_sqadd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_sqadd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_sqadd, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_sqadd, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // sqsub scalar + theEmitter->emitIns_R_R_R(INS_sqsub, EA_1BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_sqsub, EA_2BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_sqsub, EA_4BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_sqsub, EA_8BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_NONE); + + // sqsub vector + theEmitter->emitIns_R_R_R(INS_sqsub, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_sqsub, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_sqsub, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_sqsub, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_sqsub, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_sqsub, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // srhadd vector + theEmitter->emitIns_R_R_R(INS_srhadd, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_srhadd, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_srhadd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_srhadd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_srhadd, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_srhadd, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // ssubl vector + theEmitter->emitIns_R_R_R(INS_ssubl, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ssubl, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ssubl, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // ssubl2 vector + theEmitter->emitIns_R_R_R(INS_ssubl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ssubl2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ssubl2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // ssubw vector + theEmitter->emitIns_R_R_R(INS_ssubw, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ssubw, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ssubw, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // ssubw2 vector + theEmitter->emitIns_R_R_R(INS_ssubw2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ssubw2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ssubw2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // subhn vector + theEmitter->emitIns_R_R_R(INS_subhn, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_subhn, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_subhn, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // subhn2 vector + theEmitter->emitIns_R_R_R(INS_subhn2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_subhn2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_subhn2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uabal vector + theEmitter->emitIns_R_R_R(INS_uabal, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uabal, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uabal, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // uabal2 vector + theEmitter->emitIns_R_R_R(INS_uabal2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uabal2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uabal2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uabdl vector + theEmitter->emitIns_R_R_R(INS_uabdl, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uabdl, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uabdl, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // uabdl2 vector + theEmitter->emitIns_R_R_R(INS_uabdl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uabdl2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uabdl2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uaddl vector + theEmitter->emitIns_R_R_R(INS_uaddl, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uaddl, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uaddl, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // uaddl2 vector + theEmitter->emitIns_R_R_R(INS_uaddl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uaddl2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uaddl2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uaddw vector + theEmitter->emitIns_R_R_R(INS_uaddw, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uaddw, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uaddw, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // uaddw2 vector + theEmitter->emitIns_R_R_R(INS_uaddw2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uaddw2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uaddw2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uhadd vector + theEmitter->emitIns_R_R_R(INS_uhadd, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uhadd, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uhadd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_uhadd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uhadd, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uhadd, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uhsub vector + theEmitter->emitIns_R_R_R(INS_uhsub, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uhsub, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uhsub, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_uhsub, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uhsub, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uhsub, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uqadd scalar + theEmitter->emitIns_R_R_R(INS_uqadd, EA_1BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_uqadd, EA_2BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_uqadd, EA_4BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_uqadd, EA_8BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_NONE); + + // uqadd vector + theEmitter->emitIns_R_R_R(INS_uqadd, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uqadd, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uqadd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_uqadd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uqadd, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uqadd, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // uqsub scalar + theEmitter->emitIns_R_R_R(INS_uqsub, EA_1BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_uqsub, EA_2BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_uqsub, EA_4BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_NONE); + theEmitter->emitIns_R_R_R(INS_uqsub, EA_8BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_NONE); + + // uqsub vector + theEmitter->emitIns_R_R_R(INS_uqsub, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_uqsub, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_uqsub, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_uqsub, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_uqsub, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_uqsub, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // urhadd vector + theEmitter->emitIns_R_R_R(INS_urhadd, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_urhadd, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_urhadd, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_urhadd, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_urhadd, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_urhadd, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // usubl vector + theEmitter->emitIns_R_R_R(INS_usubl, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_usubl, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_usubl, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // usubl2 vector + theEmitter->emitIns_R_R_R(INS_usubl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_usubl2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_usubl2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // usubw vector + theEmitter->emitIns_R_R_R(INS_usubw, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_usubw, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_usubw, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // usubw2 vector + theEmitter->emitIns_R_R_R(INS_usubw2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_usubw2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_usubw2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + #ifdef ALL_ARM64_EMITTER_UNIT_TESTS // // R_R_R vector multiply @@ -8703,7 +8999,112 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V18, REG_V19, REG_V3, 0, INS_OPTS_8H); theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V20, REG_V21, REG_V4, 3, INS_OPTS_8H); theEmitter->emitIns_R_R_R_I(INS_mls, EA_16BYTE, REG_V22, REG_V23, REG_V5, 7, INS_OPTS_8H); +#endif // ALL_ARM64_EMITTER_UNIT_TESTS +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // pmull vector + theEmitter->emitIns_R_R_R(INS_pmull, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_pmull, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_1D); + + // pmull2 vector + theEmitter->emitIns_R_R_R(INS_pmull2, EA_16BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_pmull2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_2D); + + // smlal vector + theEmitter->emitIns_R_R_R(INS_smlal, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_smlal, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_smlal, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // smlal2 vector + theEmitter->emitIns_R_R_R(INS_smlal2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_smlal2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_smlal2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // smlsl vector + theEmitter->emitIns_R_R_R(INS_smlsl, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_smlsl, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_smlsl, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // smlsl2 vector + theEmitter->emitIns_R_R_R(INS_smlsl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_smlsl2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_smlsl2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // smull vector + theEmitter->emitIns_R_R_R(INS_smull, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_smull, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_smull, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // smull2 vector + theEmitter->emitIns_R_R_R(INS_smull2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_smull2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_smull2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // umlal vector + theEmitter->emitIns_R_R_R(INS_umlal, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_umlal, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_umlal, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // umlal2 vector + theEmitter->emitIns_R_R_R(INS_umlal2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_umlal2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_umlal2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // umlsl vector + theEmitter->emitIns_R_R_R(INS_umlsl, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_umlsl, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_umlsl, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // umlsl2 vector + theEmitter->emitIns_R_R_R(INS_umlsl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_umlsl2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_umlsl2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // umull vector + theEmitter->emitIns_R_R_R(INS_umull, EA_8BYTE, REG_V0, REG_V1, REG_V2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_umull, EA_8BYTE, REG_V3, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_umull, EA_8BYTE, REG_V6, REG_V7, REG_V8, INS_OPTS_2S); + + // umull2 vector + theEmitter->emitIns_R_R_R(INS_umull2, EA_16BYTE, REG_V9, REG_V10, REG_V11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_umull2, EA_16BYTE, REG_V12, REG_V13, REG_V14, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_umull2, EA_16BYTE, REG_V15, REG_V16, REG_V17, INS_OPTS_4S); + + // smlal vector, by element + theEmitter->emitIns_R_R_R_I(INS_smlal, EA_8BYTE, REG_V0, REG_V1, REG_V2, 3, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_smlal, EA_8BYTE, REG_V3, REG_V4, REG_V5, 1, INS_OPTS_2S); + + // smlal2 vector, by element + theEmitter->emitIns_R_R_R_I(INS_smlal2, EA_16BYTE, REG_V6, REG_V7, REG_V8, 7, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_smlal2, EA_16BYTE, REG_V9, REG_V10, REG_V11, 3, INS_OPTS_4S); + + // smlsl vector, by element + theEmitter->emitIns_R_R_R_I(INS_smlsl, EA_8BYTE, REG_V0, REG_V1, REG_V2, 3, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_smlsl, EA_8BYTE, REG_V3, REG_V4, REG_V5, 1, INS_OPTS_2S); + + // smlsl2 vector, by element + theEmitter->emitIns_R_R_R_I(INS_smlsl2, EA_16BYTE, REG_V6, REG_V7, REG_V8, 7, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_smlsl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, 3, INS_OPTS_4S); + + // smull vector, by element + theEmitter->emitIns_R_R_R_I(INS_smull, EA_8BYTE, REG_V0, REG_V1, REG_V2, 3, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_smull, EA_8BYTE, REG_V3, REG_V4, REG_V5, 1, INS_OPTS_2S); + + // smull2 vector, by element + theEmitter->emitIns_R_R_R_I(INS_smull2, EA_16BYTE, REG_V6, REG_V7, REG_V8, 7, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_smull2, EA_16BYTE, REG_V9, REG_V10, REG_V11, 3, INS_OPTS_4S); + + // umlsl2 vector, by element + theEmitter->emitIns_R_R_R_I(INS_umlsl2, EA_16BYTE, REG_V6, REG_V7, REG_V8, 7, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_umlsl2, EA_16BYTE, REG_V9, REG_V10, REG_V11, 3, INS_OPTS_4S); + + // umull vector, by element + theEmitter->emitIns_R_R_R_I(INS_umull, EA_8BYTE, REG_V0, REG_V1, REG_V2, 3, INS_OPTS_4H); + theEmitter->emitIns_R_R_R_I(INS_umull, EA_8BYTE, REG_V3, REG_V4, REG_V5, 1, INS_OPTS_2S); + + // umull2 vector, by element + theEmitter->emitIns_R_R_R_I(INS_umull2, EA_16BYTE, REG_V6, REG_V7, REG_V8, 7, INS_OPTS_8H); + theEmitter->emitIns_R_R_R_I(INS_umull2, EA_16BYTE, REG_V9, REG_V10, REG_V11, 3, INS_OPTS_4S); #endif // ALL_ARM64_EMITTER_UNIT_TESTS #ifdef ALL_ARM64_EMITTER_UNIT_TESTS @@ -8764,7 +9165,9 @@ void CodeGen::genArm64EmitterUnitTests() #endif // ALL_ARM64_EMITTER_UNIT_TESTS +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS printf("*************** End of genArm64EmitterUnitTests()\n"); +#endif // ALL_ARM64_EMITTER_UNIT_TESTS } #endif // defined(DEBUG) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index cc0076823e9d17..9ebf3de2f215ae 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -757,8 +757,6 @@ void emitter::emitInsSanityCheck(instrDesc* id) case IF_DV_2T: // DV_2T .Q......XX...... ......nnnnnddddd Sd Vn (addv, saddlv, smaxv, sminv, uaddlv, // umaxv, uminv) assert(isValidVectorDatasize(id->idOpSize())); - elemsize = optGetElemsize(id->idInsOpt()); - assert((elemsize != EA_8BYTE) && (id->idInsOpt() != INS_OPTS_2S)); // can't use 2D or 1D or 2S assert(isVectorRegister(id->idReg1())); assert(isVectorRegister(id->idReg2())); break; @@ -788,6 +786,7 @@ void emitter::emitInsSanityCheck(instrDesc* id) break; case IF_DV_3AI: // DV_3AI .Q......XXLMmmmm ....H.nnnnnddddd Vd Vn Vm[] (vector by elem) + case IF_DV_3HI: // DV_3HI ........XXLMmmmm ....H.nnnnnddddd Vd Vn Vm[] (smlal{2}, umlal{2} by element) assert(isValidVectorDatasize(id->idOpSize())); assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); assert(isVectorRegister(id->idReg1())); @@ -843,9 +842,8 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isValidVectorIndex(EA_16BYTE, elemsize, emitGetInsSC(id))); break; - case IF_DV_3E: // DV_3E ...........mmmmm ......nnnnnddddd Vd Vn Vm (scalar) + case IF_DV_3E: // DV_3E ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) assert(insOptsNone(id->idInsOpt())); - assert(id->idOpSize() == EA_8BYTE); assert(isVectorRegister(id->idReg1())); assert(isVectorRegister(id->idReg2())); assert(isVectorRegister(id->idReg3())); @@ -868,6 +866,14 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(isVectorRegister(id->idReg3())); break; + case IF_DV_3H: // DV_3H ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (addhn{2}, raddhn{2}, rsubhn{2}, + // subhn{2}, pmull{2}) + assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + assert(isVectorRegister(id->idReg1())); + assert(isVectorRegister(id->idReg2())); + assert(isVectorRegister(id->idReg3())); + break; + case IF_DV_4A: // DR_4A .........X.mmmmm .aaaaannnnnddddd Rd Rn Rm Ra (scalar) assert(isValidGeneralDatasize(id->idOpSize())); assert(isVectorRegister(id->idReg1())); @@ -921,7 +927,6 @@ bool emitter::emitInsMayWriteToGCReg(instrDesc* id) case IF_DR_3C: // DR_3C X..........mmmmm xxxsssnnnnnddddd Rd Rn Rm ext(Rm) LSL imm(0-4) case IF_DR_3D: // DR_3D X..........mmmmm cccc..nnnnnddddd Rd Rn Rm cond case IF_DR_3E: // DR_3E X........X.mmmmm ssssssnnnnnddddd Rd Rn Rm imm(0-63) - case IF_DV_3F: // DV_3F ...........mmmmm ......nnnnnddddd Vd Vn Vm (vector) - Vd both source and dest case IF_DR_4A: // DR_4A X..........mmmmm .aaaaannnnnddddd Rd Rn Rm Ra @@ -955,8 +960,13 @@ bool emitter::emitInsMayWriteToGCReg(instrDesc* id) case IF_DV_3C: // DV_3C .Q.........mmmmm ......nnnnnddddd Vd Vn Vm (vector) case IF_DV_3D: // DV_3D .........X.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) case IF_DV_3DI: // DV_3DI .........XLmmmmm ....H.nnnnnddddd Vd Vn Vm[] (scalar by elem) - case IF_DV_3E: // DV_3E ...........mmmmm ......nnnnnddddd Vd Vn Vm (scalar) + case IF_DV_3E: // DV_3E ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) + case IF_DV_3F: // DV_3F .Q......XX.mmmmm ......nnnnnddddd Vd Vn Vm (vector) case IF_DV_3G: // DV_3G .Q.........mmmmm .iiii.nnnnnddddd Vd Vn Vm imm (vector) + case IF_DV_3H: // DV_3H ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (addhn{2}, raddhn{2}, rsubhn{2}, + // subhn{2}, pmull{2}) + case IF_DV_3HI: // DV_3HI ........XXLMmmmm ....H.nnnnnddddd Vd Vn Vm[] (smlal{2}, smlsl{2}, smull{2}, + // umlal{2}, umlsl{2}, umull{2} vector by elem) case IF_DV_4A: // DV_4A .........X.mmmmm .aaaaannnnnddddd Vd Va Vn Vm (scalar) // Tracked GC pointers cannot be placed into the SIMD registers. return false; @@ -1579,6 +1589,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) const static insFormat formatEncode3H[3] = {IF_DR_3A, IF_DV_3A, IF_DV_3AI}; const static insFormat formatEncode3I[3] = {IF_DR_2E, IF_DR_2F, IF_DV_2M}; const static insFormat formatEncode3J[3] = {IF_LS_2D, IF_LS_3F, IF_LS_2E}; + const static insFormat formatEncode3K[3] = {IF_DR_3A, IF_DV_3H, IF_DV_3HI}; const static insFormat formatEncode2A[2] = {IF_DR_2E, IF_DR_2F}; const static insFormat formatEncode2B[2] = {IF_DR_3A, IF_DR_3B}; const static insFormat formatEncode2C[2] = {IF_DR_3A, IF_DI_2D}; @@ -1596,6 +1607,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) const static insFormat formatEncode2O[2] = {IF_DV_3E, IF_DV_3A}; const static insFormat formatEncode2P[2] = {IF_DV_2Q, IF_DV_3B}; const static insFormat formatEncode2Q[2] = {IF_DV_2S, IF_DV_3A}; + const static insFormat formatEncode2R[2] = {IF_DV_3H, IF_DV_3HI}; code_t code = BAD_CODE; insFormat insFmt = emitInsFormat(ins); @@ -1879,6 +1891,17 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } break; + case IF_EN3K: + for (index = 0; index < 3; index++) + { + if (fmt == formatEncode3K[index]) + { + encoding_found = true; + break; + } + } + break; + case IF_EN2A: for (index = 0; index < 2; index++) { @@ -2066,92 +2089,27 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } break; - case IF_BI_0A: - case IF_BI_0B: - case IF_BI_0C: - case IF_BI_1A: - case IF_BI_1B: - case IF_BR_1A: - case IF_BR_1B: - case IF_LS_1A: - case IF_LS_2A: - case IF_LS_2B: - case IF_LS_2C: - case IF_LS_3A: - case IF_LS_3B: - case IF_LS_3C: - case IF_LS_3D: - case IF_LS_3E: - case IF_DI_1A: - case IF_DI_1B: - case IF_DI_1C: - case IF_DI_1D: - case IF_DI_1E: - case IF_DI_1F: - case IF_DI_2A: - case IF_DI_2B: - case IF_DI_2C: - case IF_DI_2D: - case IF_DR_1D: - case IF_DR_2A: - case IF_DR_2B: - case IF_DR_2C: - case IF_DR_2D: - case IF_DR_2E: - case IF_DR_2F: - case IF_DR_2G: - case IF_DR_2H: - case IF_DR_2I: - case IF_DR_3A: - case IF_DR_3B: - case IF_DR_3C: - case IF_DR_3D: - case IF_DR_3E: - case IF_DR_4A: - case IF_DV_1A: - case IF_DV_1B: - case IF_DV_1C: - case IF_DV_2A: - case IF_DV_2B: - case IF_DV_2C: - case IF_DV_2D: - case IF_DV_2E: - case IF_DV_2F: - case IF_DV_2G: - case IF_DV_2H: - case IF_DV_2I: - case IF_DV_2J: - case IF_DV_2K: - case IF_DV_2L: - case IF_DV_2M: - case IF_DV_2N: - case IF_DV_2O: - case IF_DV_2P: - case IF_DV_2R: - case IF_DV_2T: - case IF_DV_2U: - case IF_DV_3A: - case IF_DV_3AI: - case IF_DV_3B: - case IF_DV_3BI: - case IF_DV_3C: - case IF_DV_3D: - case IF_DV_3DI: - case IF_DV_3E: - case IF_DV_3F: - case IF_DV_3G: - case IF_DV_4A: - case IF_SN_0A: - case IF_SI_0A: - case IF_SI_0B: - - index = 0; - encoding_found = true; + case IF_EN2R: + for (index = 0; index < 2; index++) + { + if (fmt == formatEncode2R[index]) + { + encoding_found = true; + break; + } + } break; default: - - encoding_found = false; + if (fmt == insFmt) + { + encoding_found = true; + index = 0; + } + else + { + encoding_found = false; + } break; } @@ -3381,6 +3339,48 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } } +// For the given 'srcArrangement' returns the "widen" 'dstArrangement' specifying the destination vector register +// arrangement +// asserts and returns INS_OPTS_NONE if an invalid 'srcArrangement' value is passed +// +/*static*/ insOpts emitter::optWidenDstArrangement(insOpts srcArrangement) +{ + insOpts dstArrangement = INS_OPTS_NONE; + + switch (srcArrangement) + { + case INS_OPTS_8B: + dstArrangement = INS_OPTS_4H; + break; + + case INS_OPTS_16B: + dstArrangement = INS_OPTS_8H; + break; + + case INS_OPTS_4H: + dstArrangement = INS_OPTS_2S; + break; + + case INS_OPTS_8H: + dstArrangement = INS_OPTS_4S; + break; + + case INS_OPTS_2S: + dstArrangement = INS_OPTS_1D; + break; + + case INS_OPTS_4S: + dstArrangement = INS_OPTS_2D; + break; + + default: + assert(!" invalid 'srcArrangement' value"); + break; + } + + return dstArrangement; +} + // For the given 'conversion' returns the 'dstsize' specified by the conversion option /*static*/ emitAttr emitter::optGetDstsize(insOpts conversion) { @@ -4619,6 +4619,17 @@ void emitter::emitIns_R_R( fmt = IF_DV_2G; break; + case INS_sadalp: + case INS_saddlp: + case INS_uadalp: + case INS_uaddlp: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isValidArrangement(size, opt)); + assert((opt != INS_OPTS_1D) && (opt != INS_OPTS_2D)); // The encoding size = 11, Q = x is reserved + fmt = IF_DV_2T; + break; + default: unreached(); break; @@ -5432,6 +5443,30 @@ void emitter::emitIns_R_R_R( /* Figure out the encoding format of the instruction */ switch (ins) { + case INS_mul: + case INS_smull: + case INS_umull: + if (insOptsAnyArrangement(opt)) + { + // ASIMD instruction + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(isValidArrangement(size, opt)); + assert((opt != INS_OPTS_1D) && (opt != INS_OPTS_2D)); // The encoding size = 11, Q = x is reserved + if (ins == INS_mul) + { + fmt = IF_DV_3A; + } + else + { + fmt = IF_DV_3H; + } + break; + } + // Base instruction + __fallthrough; + case INS_lsl: case INS_lsr: case INS_asr: @@ -5443,10 +5478,8 @@ void emitter::emitIns_R_R_R( case INS_udiv: case INS_sdiv: case INS_mneg: - case INS_smull: case INS_smnegl: case INS_smulh: - case INS_umull: case INS_umnegl: case INS_umulh: case INS_lslv: @@ -5469,44 +5502,11 @@ void emitter::emitIns_R_R_R( fmt = IF_DR_3A; break; - case INS_mul: - if (insOptsNone(opt)) - { - // general register - assert(isValidGeneralDatasize(size)); - assert(isGeneralRegister(reg1)); - assert(isGeneralRegister(reg2)); - assert(isGeneralRegister(reg3)); - fmt = IF_DR_3A; - break; - } - __fallthrough; - - case INS_mla: - case INS_mls: - case INS_pmul: - assert(insOptsAnyArrangement(opt)); - assert(isVectorRegister(reg1)); - assert(isVectorRegister(reg2)); - assert(isVectorRegister(reg3)); - assert(isValidVectorDatasize(size)); - assert(isValidArrangement(size, opt)); - elemsize = optGetElemsize(opt); - if (ins == INS_pmul) - { - assert(elemsize == EA_1BYTE); // only supports 8B or 16B - } - else // INS_mul, INS_mla, INS_mls - { - assert(elemsize != EA_8BYTE); // can't use 2D or 1D - } - fmt = IF_DV_3A; - break; - case INS_add: case INS_sub: if (isVectorRegister(reg1)) { + // ASIMD instruction assert(isVectorRegister(reg2)); assert(isVectorRegister(reg3)); @@ -5527,6 +5527,7 @@ void emitter::emitIns_R_R_R( } break; } + // Base instruction __fallthrough; case INS_adds: @@ -5547,10 +5548,8 @@ void emitter::emitIns_R_R_R( if (insOptsAnyArrangement(opt)) { // Vector operation - assert(isValidVectorDatasize(size)); assert(isValidArrangement(size, opt)); - elemsize = optGetElemsize(opt); - assert(opt != INS_OPTS_1D); // Reserved encoding + assert(opt != INS_OPTS_1D); // The encoding size = 11, Q = 0 is reserved fmt = IF_DV_3A; } else @@ -5562,6 +5561,30 @@ void emitter::emitIns_R_R_R( } break; + case INS_sqadd: + case INS_sqsub: + case INS_uqadd: + case INS_uqsub: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + + if (insOptsAnyArrangement(opt)) + { + // Vector operation + assert(isValidArrangement(size, opt)); + assert(opt != INS_OPTS_1D); // The encoding size = 11, Q = 0 is reserved + fmt = IF_DV_3A; + } + else + { + // Scalar operation + assert(insOptsNone(opt)); + assert(isValidVectorElemsize(size)); + fmt = IF_DV_3E; + } + break; + case INS_fcmeq: case INS_fcmge: case INS_fcmgt: @@ -5590,20 +5613,33 @@ void emitter::emitIns_R_R_R( } break; + case INS_mla: + case INS_mls: case INS_saba: case INS_sabd: + case INS_shadd: + case INS_shsub: case INS_smax: case INS_smaxp: case INS_smin: case INS_sminp: + case INS_srhadd: case INS_uaba: case INS_uabd: + case INS_uhadd: + case INS_uhsub: case INS_umax: case INS_umaxp: case INS_umin: case INS_uminp: - assert(elemsize != EA_8BYTE); // can't use 2D or 1D - __fallthrough; + case INS_urhadd: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(isValidArrangement(size, opt)); + assert((opt != INS_OPTS_1D) && (opt != INS_OPTS_2D)); // The encoding size = 11, Q = x is reserved + fmt = IF_DV_3A; + break; case INS_addp: case INS_uzp1: @@ -5615,13 +5651,8 @@ void emitter::emitIns_R_R_R( assert(isVectorRegister(reg1)); assert(isVectorRegister(reg2)); assert(isVectorRegister(reg3)); - assert(insOptsAnyArrangement(opt)); - - // Vector operation - assert(isValidVectorDatasize(size)); assert(isValidArrangement(size, opt)); - elemsize = optGetElemsize(opt); - + assert(opt != INS_OPTS_1D); // The encoding size = 11, Q = 0 is reserved fmt = IF_DV_3A; break; @@ -5875,6 +5906,109 @@ void emitter::emitIns_R_R_R( fmt = IF_LS_3F; break; + case INS_addhn: + case INS_raddhn: + case INS_rsubhn: + case INS_subhn: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(size == EA_8BYTE); + assert(isValidArrangement(size, opt)); + assert(opt != INS_OPTS_1D); // The encoding size = 11, Q = x is reserved. + fmt = IF_DV_3H; + break; + + case INS_addhn2: + case INS_raddhn2: + case INS_rsubhn2: + case INS_subhn2: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(size == EA_16BYTE); + assert(isValidArrangement(size, opt)); + assert(opt != INS_OPTS_2D); // The encoding size = 11, Q = x is reserved. + fmt = IF_DV_3H; + break; + + case INS_sabal: + case INS_sabdl: + case INS_saddl: + case INS_saddw: + case INS_smlal: + case INS_smlsl: + case INS_ssubl: + case INS_ssubw: + case INS_uabal: + case INS_uabdl: + case INS_uaddl: + case INS_uaddw: + case INS_umlal: + case INS_umlsl: + case INS_usubl: + case INS_usubw: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(size == EA_8BYTE); + assert((opt == INS_OPTS_8B) || (opt == INS_OPTS_4H) || (opt == INS_OPTS_2S)); + fmt = IF_DV_3H; + break; + + case INS_sabal2: + case INS_sabdl2: + case INS_saddl2: + case INS_saddw2: + case INS_smlal2: + case INS_smlsl2: + case INS_ssubl2: + case INS_ssubw2: + case INS_umlal2: + case INS_umlsl2: + case INS_smull2: + case INS_uabal2: + case INS_uabdl2: + case INS_uaddl2: + case INS_uaddw2: + case INS_usubl2: + case INS_umull2: + case INS_usubw2: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(size == EA_16BYTE); + assert((opt == INS_OPTS_16B) || (opt == INS_OPTS_8H) || (opt == INS_OPTS_4S)); + fmt = IF_DV_3H; + break; + + case INS_pmul: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(isValidArrangement(size, opt)); + assert((opt == INS_OPTS_8B) || (opt == INS_OPTS_16B)); + fmt = IF_DV_3A; + break; + + case INS_pmull: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(size == EA_8BYTE); + assert((opt == INS_OPTS_8B) || (opt == INS_OPTS_1D)); + fmt = IF_DV_3H; + break; + + case INS_pmull2: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(size == EA_16BYTE); + assert((opt == INS_OPTS_16B) || (opt == INS_OPTS_2D)); + fmt = IF_DV_3H; + break; + default: unreached(); break; @@ -6080,6 +6214,38 @@ void emitter::emitIns_R_R_R_I(instruction ins, fmt = IF_DV_3G; break; + case INS_smlal: + case INS_smlsl: + case INS_smull: + case INS_umlal: + case INS_umlsl: + case INS_umull: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(size == EA_8BYTE); + assert((opt == INS_OPTS_4H) || (opt == INS_OPTS_2S)); + elemsize = optGetElemsize(opt); + assert(isValidVectorIndex(EA_16BYTE, elemsize, imm)); + fmt = IF_DV_3HI; + break; + + case INS_smlal2: + case INS_smlsl2: + case INS_smull2: + case INS_umlal2: + case INS_umlsl2: + case INS_umull2: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(isVectorRegister(reg3)); + assert(size == EA_16BYTE); + assert((opt == INS_OPTS_8H) || (opt == INS_OPTS_4S)); + elemsize = optGetElemsize(opt); + assert(isValidVectorIndex(EA_16BYTE, elemsize, imm)); + fmt = IF_DV_3HI; + break; + default: unreached(); break; @@ -10667,7 +10833,16 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; - case IF_DV_3E: // DV_3E ...........mmmmm ......nnnnnddddd Vd Vn Vm (scalar) + case IF_DV_3E: // DV_3E ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) + code = emitInsCode(ins, fmt); + elemsize = id->idOpSize(); + code |= insEncodeElemsize(elemsize); // XX + code |= insEncodeReg_Vd(id->idReg1()); // ddddd + code |= insEncodeReg_Vn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vm(id->idReg3()); // mmmmm + dst += emitOutput_Instr(dst, code); + break; + case IF_DV_3F: // DV_3F ...........mmmmm ......nnnnnddddd Vd Vn Vm (vector) - source dest regs overlap code = emitInsCode(ins, fmt); code |= insEncodeReg_Vd(id->idReg1()); // ddddd @@ -10687,6 +10862,30 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; + case IF_DV_3H: // DV_3H ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (addhn{2}, raddhn{2}, rsubhn{2}, + // subhn{2}, pmull{2}) + code = emitInsCode(ins, fmt); + elemsize = optGetElemsize(id->idInsOpt()); + code |= insEncodeElemsize(elemsize); // XX + code |= insEncodeReg_Vm(id->idReg3()); // mmmmm + code |= insEncodeReg_Vn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vd(id->idReg1()); // ddddd + dst += emitOutput_Instr(dst, code); + break; + + case IF_DV_3HI: // DV_3HI ........XXLMmmmm ....H.nnnnnddddd Vd Vn Vm[] (smlal{2}, umlal{2} by element) + code = emitInsCode(ins, fmt); + imm = emitGetInsSC(id); + elemsize = optGetElemsize(id->idInsOpt()); + assert(isValidVectorIndex(EA_16BYTE, elemsize, imm)); + code |= insEncodeElemsize(elemsize); // XX + code |= insEncodeVectorIndexLMH(elemsize, imm); // LM H + code |= insEncodeReg_Vd(id->idReg1()); // ddddd + code |= insEncodeReg_Vn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vm(id->idReg3()); // mmmmm + dst += emitOutput_Instr(dst, code); + break; + case IF_DV_4A: // DV_4A .........X.mmmmm .aaaaannnnnddddd Vd Va Vn Vm (scalar) code = emitInsCode(ins, fmt); elemsize = id->idOpSize(); @@ -11955,8 +12154,19 @@ void emitter::emitDispIns( break; case IF_DR_2H: // DR_2H X........X...... ......nnnnnddddd Rd Rn - emitDispReg(id->idReg1(), size, true); - emitDispReg(id->idReg2(), size, false); + if ((ins == INS_uxtb) || (ins == INS_uxth)) + { + // There is no 64-bit variant of uxtb and uxth + // However, we allow idOpSize() to have EA_8BYTE value for these instruction + emitDispReg(id->idReg1(), EA_4BYTE, true); + emitDispReg(id->idReg2(), EA_4BYTE, false); + } + else + { + emitDispReg(id->idReg1(), size, true); + // sxtb, sxth and sxtb always operate on 32-bit source register + emitDispReg(id->idReg2(), EA_4BYTE, false); + } break; case IF_DR_2I: // DR_2I X..........mmmmm cccc..nnnnn.nzcv Rn Rm nzcv cond @@ -12236,9 +12446,17 @@ void emitter::emitDispIns( case IF_DV_2S: // DV_2S ........XX...... ......nnnnnddddd Sd Vn (addp - scalar) case IF_DV_2T: // DV_2T .Q......XX...... ......nnnnnddddd Sd Vn (addv, saddlv, smaxv, sminv, uaddlv, // umaxv, uminv) - elemsize = optGetElemsize(id->idInsOpt()); - emitDispReg(id->idReg1(), elemsize, true); - emitDispVectorReg(id->idReg2(), id->idInsOpt(), false); + if ((ins == INS_sadalp) || (ins == INS_saddlp) || (ins == INS_uadalp) || (ins == INS_uaddlp)) + { + emitDispVectorReg(id->idReg1(), optWidenDstArrangement(id->idInsOpt()), true); + emitDispVectorReg(id->idReg2(), id->idInsOpt(), false); + } + else + { + elemsize = optGetElemsize(id->idInsOpt()); + emitDispReg(id->idReg1(), elemsize, true); + emitDispVectorReg(id->idReg2(), id->idInsOpt(), false); + } break; case IF_DV_3A: // DV_3A .Q......XX.mmmmm ......nnnnnddddd Vd Vn Vm (vector) @@ -12266,7 +12484,7 @@ void emitter::emitDispIns( break; case IF_DV_3D: // DV_3D .........X.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) - case IF_DV_3E: // DV_3E ...........mmmmm ......nnnnnddddd Vd Vn Vm (scalar) + case IF_DV_3E: // DV_3E ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) emitDispReg(id->idReg1(), size, true); emitDispReg(id->idReg2(), size, true); emitDispReg(id->idReg3(), size, false); @@ -12310,6 +12528,52 @@ void emitter::emitDispIns( emitDispImm(emitGetInsSC(id), false); break; + case IF_DV_3H: // DV_3H ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (addhn{2}, raddhn{2}, rsubhn{2}, + // subhn{2}, pmull{2}) + if ((ins == INS_addhn) || (ins == INS_addhn2) || (ins == INS_raddhn) || (ins == INS_raddhn2) || + (ins == INS_subhn) || (ins == INS_subhn2) || (ins == INS_rsubhn) || (ins == INS_rsubhn2)) + { + // These are "high narrow" instruction i.e. their source registers are "wider" than the destination + // register. + emitDispVectorReg(id->idReg1(), id->idInsOpt(), true); + emitDispVectorReg(id->idReg2(), optWidenElemsize(id->idInsOpt()), true); + emitDispVectorReg(id->idReg3(), optWidenElemsize(id->idInsOpt()), false); + } + else + { + if (((ins == INS_pmull) && (id->idInsOpt() == INS_OPTS_1D)) || + (ins == (INS_pmull2) && (id->idInsOpt() == INS_OPTS_2D))) + { + // PMULL Vd.1Q, Vn.1D, Vm.1D + // PMULL2 Vd.1Q, Vn.2D, Vm.2D + printf("%s.1q, ", emitVectorRegName(id->idReg1())); + } + else + { + emitDispVectorReg(id->idReg1(), optWidenElemsize(id->idInsOpt()), true); + } + + if ((ins == INS_saddw) || (ins == INS_saddw2) || (ins == INS_uaddw) || (ins == INS_uaddw2) || + (ins == INS_ssubw) || (ins == INS_ssubw2) || (ins == INS_usubw) || (ins == INS_usubw2)) + { + emitDispVectorReg(id->idReg2(), optWidenElemsize(id->idInsOpt()), true); + } + else + { + emitDispVectorReg(id->idReg2(), id->idInsOpt(), true); + } + + emitDispVectorReg(id->idReg3(), id->idInsOpt(), false); + } + break; + + case IF_DV_3HI: + emitDispVectorReg(id->idReg1(), optWidenElemsize(id->idInsOpt()), true); + emitDispVectorReg(id->idReg2(), id->idInsOpt(), true); + elemsize = optGetElemsize(id->idInsOpt()); + emitDispVectorRegIndex(id->idReg3(), elemsize, emitGetInsSC(id), false); + break; + case IF_DV_4A: // DV_4A .........X.mmmmm .aaaaannnnnddddd Vd Va Vn Vm (scalar) emitDispReg(id->idReg1(), size, true); emitDispReg(id->idReg2(), size, true); @@ -12936,6 +13200,16 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency = PERFSCORE_LATENCY_1C; break; + case INS_smaddl: + case INS_smsubl: + case INS_smnegl: + case INS_umaddl: + case INS_umsubl: + case INS_umnegl: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + default: // all other instructions perfScoreUnhandledInstruction(id, &result); @@ -13921,6 +14195,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_cmgt: case INS_cmhi: case INS_cmhs: + case INS_shadd: + case INS_shsub: + case INS_srhadd: case INS_smax: case INS_smaxp: case INS_smin: @@ -13929,6 +14206,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_umaxp: case INS_umin: case INS_uminp: + case INS_uhadd: + case INS_uhsub: + case INS_urhadd: case INS_uzp1: case INS_uzp2: case INS_zip1: @@ -13955,7 +14235,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_cmtst: case INS_pmul: case INS_sabd: + case INS_sqadd: + case INS_sqsub: case INS_uabd: + case INS_uqadd: + case INS_uqsub: result.insThroughput = PERFSCORE_THROUGHPUT_2X; result.insLatency = PERFSCORE_LATENCY_3C; break; @@ -14092,10 +14376,26 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_rshrn: case INS_rshrn2: + case INS_ssra: case INS_srshr: case INS_urshr: - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency = PERFSCORE_LATENCY_3C; + case INS_usra: + if (id->idOpSize() == EA_16BYTE) + { + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + else + { + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_3C; + } + break; + + case INS_srsra: + case INS_ursra: + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; break; default: @@ -14140,6 +14440,90 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins } break; + case IF_DV_3H: // addhn{2}, raddhn{2}, rsubhn{2}, sabal{2}, sabdl{2}, saddl{2}, saddw{2}, ssubl{2}, ssubw{2}, + // pmull{2} + case IF_DV_3HI: // subhn{2}, uabal{2}, uabdl{2}, uaddl{2}, uaddw{2}, usubl{2}, usubw{2} + switch (ins) + { + case INS_addhn: + case INS_addhn2: + case INS_sabdl: + case INS_sabdl2: + case INS_saddl: + case INS_saddl2: + case INS_saddw: + case INS_saddw2: + case INS_ssubl: + case INS_ssubl2: + case INS_ssubw: + case INS_ssubw2: + case INS_subhn: + case INS_subhn2: + case INS_uabdl: + case INS_uabdl2: + case INS_uaddl: + case INS_uaddl2: + case INS_uaddw: + case INS_uaddw2: + case INS_usubl: + case INS_usubl2: + case INS_usubw: + case INS_usubw2: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_raddhn: + case INS_raddhn2: + case INS_rsubhn: + case INS_rsubhn2: + case INS_sabal: + case INS_sabal2: + case INS_uabal: + case INS_uabal2: + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_smlal: + case INS_smlal2: + case INS_smlsl: + case INS_smlsl2: + case INS_smull: + case INS_smull2: + case INS_umlal: + case INS_umlal2: + case INS_umlsl: + case INS_umlsl2: + case INS_umull: + case INS_umull2: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_pmull: + case INS_pmull2: + if ((id->idInsOpt() == INS_OPTS_8B) || (id->idInsOpt() == INS_OPTS_16B)) + { + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + else + { + // Crypto polynomial (64x64) multiply long + assert((id->idInsOpt() == INS_OPTS_1D) || (id->idInsOpt() == INS_OPTS_2D)); + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + case IF_SI_0A: // brk imm16 result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency = PERFSCORE_LATENCY_1C; @@ -14165,6 +14549,18 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency = PERFSCORE_LATENCY_4C; break; + case INS_sadalp: + case INS_uadalp: + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_saddlp: + case INS_uaddlp: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + default: // all other instructions perfScoreUnhandledInstruction(id, &result); diff --git a/src/coreclr/src/jit/emitarm64.h b/src/coreclr/src/jit/emitarm64.h index f6286a6062360a..751d0fe28a5c93 100644 --- a/src/coreclr/src/jit/emitarm64.h +++ b/src/coreclr/src/jit/emitarm64.h @@ -438,6 +438,11 @@ static emitAttr optGetElemsize(insOpts arrangement); // For the given 'arrangement' returns the 'widen-arrangement' specified by the vector register arrangement static insOpts optWidenElemsize(insOpts arrangement); +// For the given 'srcArrangement' returns the "widen" 'dstArrangement' specifying the destination vector register +// arrangement +// of Long Pairwise instructions. Note that destination vector elements twice as long as the source vector elements. +static insOpts optWidenDstArrangement(insOpts srcArrangement); + // For the given 'conversion' returns the 'dstsize' specified by the conversion option static emitAttr optGetDstsize(insOpts conversion); diff --git a/src/coreclr/src/jit/emitfmtsarm64.h b/src/coreclr/src/jit/emitfmtsarm64.h index fe8097217d1a1e..16244c1a34285a 100644 --- a/src/coreclr/src/jit/emitfmtsarm64.h +++ b/src/coreclr/src/jit/emitfmtsarm64.h @@ -71,6 +71,7 @@ IF_DEF(EN3G, IS_NONE, NONE) // Instruction has 3 possible encoding types, type G IF_DEF(EN3H, IS_NONE, NONE) // Instruction has 3 possible encoding types, type H IF_DEF(EN3I, IS_NONE, NONE) // Instruction has 3 possible encoding types, type I IF_DEF(EN3J, IS_NONE, NONE) // Instruction has 3 possible encoding types, type J +IF_DEF(EN3K, IS_NONE, NONE) // Instruction has 3 possible encoding types, type K IF_DEF(EN2A, IS_NONE, NONE) // Instruction has 2 possible encoding types, type A IF_DEF(EN2B, IS_NONE, NONE) // Instruction has 2 possible encoding types, type B IF_DEF(EN2C, IS_NONE, NONE) // Instruction has 2 possible encoding types, type C @@ -88,6 +89,7 @@ IF_DEF(EN2N, IS_NONE, NONE) // Instruction has 2 possible encoding types, type N IF_DEF(EN2O, IS_NONE, NONE) // Instruction has 2 possible encoding types, type O IF_DEF(EN2P, IS_NONE, NONE) // Instruction has 2 possible encoding types, type P IF_DEF(EN2Q, IS_NONE, NONE) // Instruction has 2 possible encoding types, type Q +IF_DEF(EN2R, IS_NONE, NONE) // Instruction has 2 possible encoding types, type R ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // @@ -207,18 +209,20 @@ IF_DEF(DV_2S, IS_NONE, NONE) // DV_2S ........XX...... ......nnnnnddddd S IF_DEF(DV_2T, IS_NONE, NONE) // DV_2T .Q......XX...... ......nnnnnddddd Sd Vn (addv, saddlv, smaxv, sminv, uaddlv, umaxv, uminv) IF_DEF(DV_2U, IS_NONE, NONE) // DV_2U ................ ......nnnnnddddd Sd Sn (sha1h) -IF_DEF(DV_3A, IS_NONE, NONE) // DV_3A .Q......XX.mmmmm ......nnnnnddddd Vd Vn Vm (vector) -IF_DEF(DV_3AI, IS_NONE, NONE) // DV_3AI .Q......XXLMmmmm ....H.nnnnnddddd Vd Vn Vm[] (vector by elem) -IF_DEF(DV_3B, IS_NONE, NONE) // DV_3B .Q.......X.mmmmm ......nnnnnddddd Vd Vn Vm (vector) -IF_DEF(DV_3BI, IS_NONE, NONE) // DV_3BI .Q.......XLmmmmm ....H.nnnnnddddd Vd Vn Vm[] (vector by elem) -IF_DEF(DV_3C, IS_NONE, NONE) // DV_3C .Q.........mmmmm ......nnnnnddddd Vd Vn Vm (vector) -IF_DEF(DV_3D, IS_NONE, NONE) // DV_3D .........X.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) -IF_DEF(DV_3DI, IS_NONE, NONE) // DV_3DI .........XLmmmmm ....H.nnnnnddddd Vd Vn Vm[] (scalar by elem) -IF_DEF(DV_3E, IS_NONE, NONE) // DV_3E ...........mmmmm ......nnnnnddddd Vd Vn Vm (scalar) -IF_DEF(DV_3F, IS_NONE, NONE) // DV_3F ...........mmmmm ......nnnnnddddd Qd Sn Vm (Qd used as both source and destination) -IF_DEF(DV_3G, IS_NONE, NONE) // DV_3G .Q.........mmmmm .iiii.nnnnnddddd Vd Vn Vm imm (vector) - -IF_DEF(DV_4A, IS_NONE, NONE) // DV_4A .........X.mmmmm .aaaaannnnnddddd Vd Vn Vm Va (scalar) +IF_DEF(DV_3A, IS_NONE, NONE) // DV_3A .Q......XX.mmmmm ......nnnnnddddd Vd Vn Vm (vector) +IF_DEF(DV_3AI, IS_NONE, NONE) // DV_3AI .Q......XXLMmmmm ....H.nnnnnddddd Vd Vn Vm[] (vector by elem) +IF_DEF(DV_3B, IS_NONE, NONE) // DV_3B .Q.......X.mmmmm ......nnnnnddddd Vd Vn Vm (vector) +IF_DEF(DV_3BI, IS_NONE, NONE) // DV_3BI .Q.......XLmmmmm ....H.nnnnnddddd Vd Vn Vm[] (vector by elem) +IF_DEF(DV_3C, IS_NONE, NONE) // DV_3C .Q.........mmmmm ......nnnnnddddd Vd Vn Vm (vector) +IF_DEF(DV_3D, IS_NONE, NONE) // DV_3D .........X.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) +IF_DEF(DV_3DI, IS_NONE, NONE) // DV_3DI .........XLmmmmm ....H.nnnnnddddd Vd Vn Vm[] (scalar by elem) +IF_DEF(DV_3E, IS_NONE, NONE) // DV_3E ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (scalar) +IF_DEF(DV_3F, IS_NONE, NONE) // DV_3F ...........mmmmm ......nnnnnddddd Qd Sn Vm (Qd used as both source and destination) +IF_DEF(DV_3G, IS_NONE, NONE) // DV_3G .Q.........mmmmm .iiii.nnnnnddddd Vd Vn Vm imm (vector) +IF_DEF(DV_3H, IS_NONE, NONE) // DV_3H ........XX.mmmmm ......nnnnnddddd Vd Vn Vm (addhn{2}, raddhn{2}, rsubhn{2}, pmull{2}, smlal{2}, subhn{2}, umlal{2} vector) +IF_DEF(DV_3HI, IS_NONE, NONE) // DV_3HI ........XXLMmmmm ....H.nnnnnddddd Vd Vn Vm[] (smlal{2}, smlsl{2}, smull{2}, umlal{2}, umlsl{2}, umull{2} vector by elem) + +IF_DEF(DV_4A, IS_NONE, NONE) // DV_4A .........X.mmmmm .aaaaannnnnddddd Vd Vn Vm Va (scalar) IF_DEF(SN_0A, IS_NONE, NONE) // SN_0A ................ ................ IF_DEF(SI_0A, IS_NONE, NONE) // SI_0A ...........iiiii iiiiiiiiiii..... imm16 diff --git a/src/coreclr/src/jit/instrsarm64.h b/src/coreclr/src/jit/instrsarm64.h index 12860fec6a7096..1b8e99e587b3f8 100644 --- a/src/coreclr/src/jit/instrsarm64.h +++ b/src/coreclr/src/jit/instrsarm64.h @@ -513,6 +513,22 @@ INST3(ld4r, "ld4r", 0,LD, IF_EN3J, 0x0D60E000, 0x0DE0E000, 0x0DFFE000) // ld4r {Vt-Vt4},[Xn],Xm LS_3F 0Q001101111mmmmm 1110ssnnnnnttttt 0DE0 E000 post-indexed by a register // ld4r {Vt-Vt4},[Xn],#8 LS_2E 0Q00110111111111 1110ssnnnnnttttt 0DFF E000 post-indexed by an immediate +INST3(smull, "smull", 0, 0, IF_EN3K, 0x9B207C00, 0x0E20C000, 0x0F00A000) + // C6.2.243 SMULL + // C7.2.272 SMULL, SMULL2 (by element) + // C7.2.273 SMULL, SMULL2 (vector) + // smull Rd,Rn,Rm DR_3A 10011011001mmmmm 011111nnnnnddddd 9B20 7C00 + // smull Vd,Vn,Vm DV_3H 0000111000100000 1100000000000000 0E20 C000 Vd,Vn,Vm (vector) + // smull Vd,Vn,Vm[] DV_3HI 00001111XXLMmmmm 1010H0nnnnnddddd 0F00 A000 Vd,Vn,Vm[] (vector by elem) + +INST3(umull, "umull", 0, 0, IF_EN3K, 0x9BA07C00, 0x2E20C000, 0x2F00A000) + // C6.2.340 UMULL + // C7.2.362 UMULL, UMULL2 (by element) + // C7.2.363 UMULL, UMULL2 (vector) + // umull Rd,Rn,Rm DR_3A 10011011101mmmmm 011111nnnnnddddd 9BA0 7C00 + // umull Vd,Vn,Vm DV_3H 00101110XX1mmmmm 110000nnnnnddddd 2E20 C000 Vd,Vn,Vm (vector) + // umull Vd,Vn,Vm[] DV_3HI 00101111XXLMmmmm 1010H0nnnnnddddd 2F00 A000 Vd,Vn,Vm[] (vector by elem) + // enum name FP LD/ST DR_2E DR_2F INST2(negs, "negs", 0, 0, IF_EN2A, 0x6B0003E0, 0x6B0003E0) // negs Rd,Rm DR_2E X1101011000mmmmm 00000011111ddddd 6B00 03E0 @@ -742,6 +758,66 @@ INST2(mls, "mls", 0, 0, IF_EN2M, 0x2E209400, 0x2F004000) // mls Vd,Vn,Vm DV_3A 0Q101110XX1mmmmm 100101nnnnnddddd 2E20 9400 Vd,Vn,Vm (vector) // mls Vd,Vn,Vm[] DV_3AI 0Q101111XXLMmmmm 0100H0nnnnnddddd 2F00 4000 Vd,Vn,Vm[] (vector by elem) +INST2(smlal, "smlal", 0, 0, IF_EN2R, 0x0E208000, 0x0F002000) + // C7.2.267 SMLAL, SMLAL2 (by element) + // C7.2.268 SMLAL, SMLAL2 (vector) + // smlal Vd,Vn,Vm DV_3H 00001110XX1mmmmm 100000nnnnnddddd 0E20 8000 Vd,Vn,Vm (vector) + // smlal Vd,Vn,Vm[] DV_3HI 00001111XXLMmmmm 0010H0nnnnnddddd 0F00 2000 Vd,Vn,Vm[] (vector by elem) + +INST2(smlal2, "smlal2", 0, 0, IF_EN2R, 0x4E208000, 0x4F002000) + // C7.2.267 SMLAL, SMLAL2 (by element) + // C7.2.268 SMLAL, SMLAL2 (vector) + // smlal2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 100000nnnnnddddd 4E20 8000 Vd,Vn,Vm (vector) + // smlal2 Vd,Vn,Vm[] DV_3HI 01001111XXLMmmmm 0010H0nnnnnddddd 4F00 2000 Vd,Vn,Vm[] (vector by elem) + +INST2(smlsl, "smlsl", 0, 0, IF_EN2R, 0x0E20A000, 0x0F006000) + // C7.2.269 SMLSL, SMLSL2 (by element) + // C7.2.270 SMLSL, SMLSL2 (vector) + // smlsl Vd,Vn,Vm DV_3H 00001110XX1mmmmm 101000nnnnnddddd 0E20 A000 Vd,Vn,Vm (vector) + // smlsl Vd,Vn,Vm[] DV_3HI 00001111XXLMmmmm 0110H0nnnnnddddd 0F00 6000 Vd,Vn,Vm[] (vector by elem) + +INST2(smlsl2, "smlsl2", 0, 0, IF_EN2R, 0x4E20A000, 0x4F006000) + // C7.2.269 SMLSL, SMLSL2 (by element) + // C7.2.270 SMLSL, SMLSL2 (vector) + // smlsl2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 101000nnnnnddddd 4E20 A000 Vd,Vn,Vm (vector) + // smlsl2 Vd,Vn,Vm[] DV_3HI 01001111XXLMmmmm 0110H0nnnnnddddd 4F00 6000 Vd,Vn,Vm[] (vector by elem) + +INST2(smull2, "smull2", 0, 0, IF_EN2R, 0x4E20C000, 0x4F00A000) + // C7.2.272 SMULL, SMULL2 (by element) + // C7.2.273 SMULL, SMULL2 (vector) + // smull2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 110000nnnnnddddd 4E20 C000 Vd,Vn,Vm (vector) + // smull2 Vd,Vn,Vm[] DV_3HI 01001111XXLMmmmm 1010H0nnnnnddddd 4F00 A000 Vd,Vn,Vm[] (vector by elem) + +INST2(umlal, "umlal", 0, 0, IF_EN2R, 0x2E208000, 0x2F002000) + // C7.2.357 UMLAL, UMLAL2 (by element) + // C7.2.358 UMLAL, UMLAL2 (vector) + // umlal Vd,Vn,Vm DV_3H 00101110XX1mmmmm 100000nnnnnddddd 2E20 8000 Vd,Vn,Vm (vector) + // umlal Vd,Vn,Vm[] DV_3HI 00101111XXLMmmmm 0010H0nnnnnddddd 2F00 2000 Vd,Vn,Vm[] (vector by elem) + +INST2(umlal2, "umlal2", 0, 0, IF_EN2R, 0x6E208000, 0x6F002000) + // C7.2.357 UMLAL, UMLAL2 (by element) + // C7.2.358 UMLAL, UMLAL2 (vector) + // umlal2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 100000nnnnnddddd 6E20 8000 Vd,Vn,Vm (vector) + // umlal2 Vd,Vn,Vm[] DV_3HI 01101111XXLMmmmm 0010H0nnnnnddddd 6F00 2000 Vd,Vn,Vm[] (vector by elem) + +INST2(umlsl, "umlsl", 0, 0, IF_EN2R, 0x2E20A000, 0x2F006000) + // C7.2.359 UMLSL, UMLSL2 (by element) + // C7.2.360 UMLSL, UMLSL2 (vector) + // umlsl Vd,Vn,Vm DV_3H 00101110XX1mmmmm 101000nnnnnddddd 2E20 A000 Vd,Vn,Vm (vector) + // umlsl Vd,Vn,Vm[] DV_3HI 00101111XXLMmmmm 0110H0nnnnnddddd 2F00 6000 Vd,Vn,Vm[] (vector by elem) + +INST2(umlsl2, "umlsl2", 0, 0, IF_EN2R, 0x6E20A000, 0x6F006000) + // C7.2.359 UMLSL, UMLSL2 (by element) + // C7.2.360 UMLSL, UMLSL2 (vector) + // umlsl2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 101000nnnnnddddd 6E20 A000 Vd,Vn,Vm (vector) + // umlsl2 Vd,Vn,Vm[] DV_3HI 01101111XXLMmmmm 0110H0nnnnnddddd 6F00 6000 Vd,Vn,Vm[] (vector by elem) + +INST2(umull2, "umull2", 0, 0, IF_EN2R, 0x6E20C000, 0x6F00A000) + // C7.2.362 UMULL, UMULL2 (by element) + // C7.2.363 UMULL, UMULL2 (vector) + // umull2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 110000nnnnnddddd 6E20 C000 Vd,Vn,Vm (vector) + // umull2 Vd,Vn,Vm[] DV_3HI 01101111XXLMmmmm 1010H0nnnnnddddd 6F00 A000 Vd,Vn,Vm[] (vector by elem) + // enum name FP LD/ST DV_2N DV_2O INST2(sshr, "sshr", 0, 0, IF_EN2N, 0x5F000400, 0x0F000400) // sshr Vd,Vn,imm DV_2N 010111110iiiiiii 000001nnnnnddddd 5F00 0400 Vd Vn imm (shift - scalar) @@ -800,6 +876,26 @@ INST2(cmtst, "cmtst", 0, 0, IF_EN2O, 0x5EE08C00, 0x0E208C00) // cmtst Vd,Vn,Vm DV_3E 01011110111mmmmm 100011nnnnnddddd 5EE0 8C00 Vd,Vn,Vm (scalar) // cmtst Vd,Vn,Vm DV_3A 0Q001110XX1mmmmm 100011nnnnnddddd 0E20 8C00 Vd,Vn,Vm (vector) +INST2(sqadd, "sqadd", 0, 0, IF_EN2O, 0x5E200C00, 0x0E200C00) + // C7.2.275 SQADD + // sqadd Vd,Vn,Vm DV_3E 01011110XX1mmmmm 000011nnnnnddddd 5E20 0C00 Vd,Vn,Vm (scalar) + // sqadd Vd,Vn,Vm DV_3A 0Q001110XX1mmmmm 000011nnnnnddddd 0E20 0C00 Vd,Vn,Vm (vector) + +INST2(sqsub, "sqsub", 0, 0, IF_EN2O, 0x5E202C00, 0x0E202C00) + // C7.2.299 SQSUB + // sqsub Vd,Vn,Vm DV_3E 01011110XX1mmmmm 001011nnnnnddddd 5E20 2C00 Vd,Vn,Vm (scalar) + // sqsub Vd,Vn,Vm DV_3A 0Q001110XX1mmmmm 001011nnnnnddddd 0E20 2C00 Vd,Vn,Vm (vector) + +INST2(uqadd, "uqadd", 0, 0, IF_EN2O, 0x7E200C00, 0x2E200C00) + // C7.2.364 UQADD + // uqadd Vd,Vn,Vm DV_3E 01111110XX1mmmmm 000011nnnnnddddd 7E20 0C00 Vd,Vn,Vm (scalar) + // uqadd Vd,Vn,Vm DV_3A 0Q101110XX1mmmmm 000011nnnnnddddd 2E20 0C00 Vd,Vn,Vm (vector) + +INST2(uqsub, "uqsub", 0, 0, IF_EN2O, 0x7E202C00, 0x2E202C00) + // C7.2.370 UQSUB + // uqsub Vd,Vn,Vm DV_3E 01111110XX1mmmmm 001011nnnnnddddd 7E20 2C00 Vd,Vn,Vm (scalar) + // uqsub Vd,Vn,Vm DV_3A 0Q101110XX1mmmmm 001011nnnnnddddd 2E20 2C00 Vd,Vn,Vm (vector) + // enum name FP LD/ST DV_2Q DV_3B INST2(faddp, "faddp", 0, 0, IF_EN2P, 0x7E30D800, 0x2E20D400) // faddp Vd,Vn DV_2Q 011111100X110000 110110nnnnnddddd 7E30 D800 Vd,Vn (scalar) @@ -1197,9 +1293,6 @@ INST1(madd, "madd", 0, 0, IF_DR_4A, 0x1B000000) INST1(msub, "msub", 0, 0, IF_DR_4A, 0x1B008000) // msub Rd,Rn,Rm,Ra DR_4A X0011011000mmmmm 1aaaaannnnnddddd 1B00 8000 -INST1(smull, "smull", 0, 0, IF_DR_3A, 0x9B207C00) - // smull Rd,Rn,Rm DR_3A 10011011001mmmmm 011111nnnnnddddd 9B20 7C00 - INST1(smaddl, "smaddl", 0, 0, IF_DR_4A, 0x9B200000) // smaddl Rd,Rn,Rm,Ra DR_4A 10011011001mmmmm 0aaaaannnnnddddd 9B20 0000 @@ -1212,9 +1305,6 @@ INST1(smsubl, "smsubl", 0, 0, IF_DR_4A, 0x9B208000) INST1(smulh, "smulh", 0, 0, IF_DR_3A, 0x9B407C00) // smulh Rd,Rn,Rm DR_3A 10011011010mmmmm 011111nnnnnddddd 9B40 7C00 -INST1(umull, "umull", 0, 0, IF_DR_3A, 0x9BA07C00) - // umull Rd,Rn,Rm DR_3A 10011011101mmmmm 011111nnnnnddddd 9BA0 7C00 - INST1(umaddl, "umaddl", 0, 0, IF_DR_4A, 0x9BA00000) // umaddl Rd,Rn,Rm,Ra DR_4A 10011011101mmmmm 0aaaaannnnnddddd 9BA0 0000 @@ -1525,6 +1615,182 @@ INST1(frecpx, "frecpx", 0, 0, IF_DV_2G, 0x5EA1F800) // C7.2.139 FRECPX // frecpx Vd,Vn DV_2G 010111101X100001 111110nnnnnddddd 5EA1 F800 Vd,Vn (scalar) +INST1(addhn, "addhn", 0, 0, IF_DV_3H, 0x0E204000) + // C7.2.3 ADDHN, ADDHN2 + // addhn Vd,Vn,Vm DV_3H 00001110XX1mmmmm 010000nnnnnddddd 0E20 4000 Vd,Vn,Vm (vector) + +INST1(addhn2, "addhn2", 0, 0, IF_DV_3H, 0x4E204000) + // C7.2.3 ADDHN, ADDHN2 + // addhn2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 010000nnnnnddddd 4E20 4000 Vd,Vn,Vm (vector) + +INST1(pmull, "pmull", 0, 0, IF_DV_3H, 0x0E20E000) + // C7.2.208 PMULL, PMULL2 + // pmull Vd,Vn,Vm DV_3H 00001110XX1mmmmm 111000nnnnnddddd 0E20 E000 Vd,Vn,Vm (vector) + +INST1(pmull2, "pmull2", 0, 0, IF_DV_3H, 0x4E20E000) + // C7.2.208 PMULL, PMULL2 + // pmull2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 111000nnnnnddddd 4E20 E000 Vd,Vn,Vm (vector) + +INST1(raddhn, "raddhn", 0, 0, IF_DV_3H, 0x2E204000) + // C7.2.209 RADDHN, RADDHN2 + // raddhn Vd,Vn,Vm DV_3H 00101110XX1mmmmm 010000nnnnnddddd 2E20 4000 Vd,Vn,Vm (vector) + +INST1(raddhn2, "raddhn2",0, 0, IF_DV_3H, 0x6E204000) + // C7.2.209 RADDHN, RADDHN2 + // raddhn2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 010000nnnnnddddd 6E20 4000 Vd,Vn,Vm (vector) + +INST1(rsubhn, "rsubhn", 0, 0, IF_DV_3H, 0x2E206000) + // C7.2.216 RSUBHN, RSUBHN2 + // rsubhn Vd,Vn,Vm DV_3H 00101110XX1mmmmm 011000nnnnnddddd 2E20 6000 Vd,Vn,Vm (vector) + +INST1(rsubhn2, "rsubhn2",0, 0, IF_DV_3H, 0x6E206000) + // C7.2.216 RSUBHN, RSUBHN2 + // rsubhn2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 011000nnnnnddddd 6E20 6000 Vd,Vn,Vm (vector) + +INST1(sabal, "sabal", 0, 0, IF_DV_3H, 0x0E205000) + // C7.2.218 SABAL, SABAL2 + // sabal Vd,Vn,Vm DV_3H 00001110XX1mmmmm 010100nnnnnddddd 0E20 5000 Vd,Vn,Vm (vector) + +INST1(sabal2, "sabal2", 0, 0, IF_DV_3H, 0x4E205000) + // C7.2.218 SABAL, SABAL2 + // sabal2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 010100nnnnnddddd 4E20 5000 Vd,Vn,Vm (vector) + +INST1(sabdl, "sabdl", 0, 0, IF_DV_3H, 0x0E207000) + // C7.2.220 SABDL, SABDL2 + // sabdl Vd,Vn,Vm DV_3H 00001110XX1mmmmm 011100nnnnnddddd 0E20 7000 Vd,Vn,Vm (vector) + +INST1(sabdl2, "sabdl2", 0, 0, IF_DV_3H, 0x4E207000) + // C7.2.220 SABDL, SABDL2 + // sabdl2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 011100nnnnnddddd 4E20 7000 Vd,Vn,Vm (vector) + +INST1(sadalp, "sadalp", 0, 0, IF_DV_2T, 0x0E206800) + // C7.2.221 SADALP + // sadalp Vd,Vn DV_2T 0Q001110XX100000 011010nnnnnddddd 0E20 6800 Vd,Vn (vector) + +INST1(saddl, "saddl", 0, 0, IF_DV_3H, 0x0E200000) + // C7.2.222 SADDL, SADDL2 + // saddl Vd,Vn,Vm DV_3H 00001110XX1mmmmm 000000nnnnnddddd 0E20 0000 Vd,Vn,Vm (vector) + +INST1(saddl2, "saddl2", 0, 0, IF_DV_3H, 0x4E200000) + // C7.2.222 SADDL, SADDL2 + // saddl2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 000000nnnnnddddd 4E20 0000 Vd,Vn,Vm (vector) + +INST1(saddlp, "saddlp", 0, 0, IF_DV_2T, 0x0E202800) + // C7.2.223 SADDLP + // saddlp Vd,Vn DV_2T 0Q001110XX100000 001010nnnnnddddd 0E20 2800 Vd,Vn (vector) + +INST1(saddw, "saddw", 0, 0, IF_DV_3H, 0x0E201000) + // C7.2.225 SADDW, SADDW2 + // saddw Vd,Vn,Vm DV_3H 00001110XX1mmmmm 000100nnnnnddddd 0E20 1000 Vd,Vn,Vm (vector) + +INST1(saddw2, "saddw2", 0, 0, IF_DV_3H, 0x4E201000) + // C7.2.225 SADDW, SADDW2 + // saddw2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 000100nnnnnddddd 4E20 1000 Vd,Vn,Vm (vector) + +INST1(shadd, "shadd", 0, 0, IF_DV_3A, 0x0E200400) + // C7.2.246 SHADD + // shadd Vd,Vn,Vm DV_3A 0Q001110XX1mmmmm 000001nnnnnddddd 0E20 0400 Vd,Vn,Vm (vector) + +INST1(shsub, "shsub", 0, 0, IF_DV_3A, 0x0E202400) + // C7.2.250 SHSUB + // shsub Vd,Vn,Vm DV_3A 0Q001110XX1mmmmm 001001nnnnnddddd 0E20 2400 Vd,Vn,Vm (vector) + +INST1(srhadd, "srhadd", 0, 0, IF_DV_3A, 0x0E201400) + // C7.2.302 SRHADD + // srhadd Vd,Vn,Vm DV_3A 0Q001110XX1mmmmm 000101nnnnnddddd 0E20 1400 Vd,Vn,Vm (vector) + +INST1(ssubl, "ssubl", 0, 0, IF_DV_3H, 0x0E202000) + // C7.2.311 SSUBL, SSUBL2 + // ssubl Vd,Vn,Vm DV_3H 00001110XX1mmmmm 001000nnnnnddddd 0E20 2000 Vd,Vn,Vm (vector) + +INST1(ssubl2, "ssubl2", 0, 0, IF_DV_3H, 0x4E202000) + // C7.2.311 SSUBL, SSUBL2 + // ssubl2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 001000nnnnnddddd 4E20 2000 Vd,Vn,Vm (vector) + +INST1(ssubw, "ssubw", 0, 0, IF_DV_3H, 0x0E203000) + // C7.2.312 SSUBW, SSUBW2 + // ssubw Vd,Vn,Vm DV_3H 00001110XX1mmmmm 001100nnnnnddddd 0E20 3000 Vd,Vn,Vm (vector) + +INST1(ssubw2, "ssubw2", 0, 0, IF_DV_3H, 0x4E203000) + // C7.2.312 SSUBW, SSUBW2 + // ssubw2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 001100nnnnnddddd 4E20 3000 Vd,Vn,Vm (vector) + +INST1(subhn, "subhn", 0, 0, IF_DV_3H, 0x0E206000) + // C7.2.327 SUBHN, SUBHN2 + // subhn Vd,Vn,Vm DV_3H 00001110XX1mmmmm 011000nnnnnddddd 0E20 6000 Vd,Vn,Vm (vector) + +INST1(subhn2, "subhn2",0, 0, IF_DV_3H, 0x4E206000) + // C7.2.327 SUBHN, SUBHN2 + // subhn2 Vd,Vn,Vm DV_3H 01001110XX1mmmmm 011000nnnnnddddd 4E20 6000 Vd,Vn,Vm (vector) + +INST1(uabal, "uabal", 0, 0, IF_DV_3H, 0x2E205000) + // C7.2.335 UABAL, UABAL2 + // uabal Vd,Vn,Vm DV_3H 00101110XX1mmmmm 010100nnnnnddddd 2E20 5000 Vd,Vn,Vm (vector) + +INST1(uabal2, "uabal2", 0, 0, IF_DV_3H, 0x6E205000) + // C7.2.335 UABAL, UABAL2 + // uabal2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 010100nnnnnddddd 6E20 5000 Vd,Vn,Vm (vector) + +INST1(uabdl, "uabdl", 0, 0, IF_DV_3H, 0x2E207000) + // C7.2.337 UABDL, UABDL2 + // uabdl Vd,Vn,Vm DV_3H 00101110XX1mmmmm 011100nnnnnddddd 2E20 7000 Vd,Vn,Vm (vector) + +INST1(uabdl2, "uabdl2", 0, 0, IF_DV_3H, 0x6E207000) + // C7.2.337 UABDL, UABDL2 + // uabdl2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 011100nnnnnddddd 6E20 7000 Vd,Vn,Vm (vector) + +INST1(uadalp, "uadalp", 0, 0, IF_DV_2T, 0x2E206800) + // C7.2.338 UADALP + // uadalp Vd,Vn DV_2T 0Q101110XX100000 011010nnnnnddddd 2E20 6800 Vd,Vn (vector) + +INST1(uaddl, "uaddl", 0, 0, IF_DV_3H, 0x2E200000) + // C7.2.339 UADDL, UADDL2 + // uaddl Vd,Vn,Vm DV_3H 00101110XX1mmmmm 000000nnnnnddddd 2E20 0000 Vd,Vn,Vm (vector) + +INST1(uaddl2, "uaddl2", 0, 0, IF_DV_3H, 0x6E200000) + // C7.2.339 UADDL, UADDL2 + // uaddl2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 000000nnnnnddddd 6E20 0000 Vd,Vn,Vm (vector) + +INST1(uaddlp, "uaddlp", 0, 0, IF_DV_2T, 0x2E202800) + // C7.2.340 UADDLP + // uaddlp Vd,Vn DV_2T 0Q101110XX100000 001010nnnnnddddd 2E20 2800 Vd,Vn (vector) + +INST1(uaddw, "uaddw", 0, 0, IF_DV_3H, 0x2E201000) + // C7.2.342 UADDW, UADDW2 + // uaddw Vd,Vn,Vm DV_3H 00101110XX1mmmmm 000100nnnnnddddd 2E20 1000 Vd,Vn,Vm (vector) + +INST1(uaddw2, "uaddw2", 0, 0, IF_DV_3H, 0x6E201000) + // C7.2.342 UADDW, UADDW2 + // uaddw2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 000100nnnnnddddd 6E20 1000 Vd,Vn,Vm (vector) + +INST1(uhadd, "uhadd", 0, 0, IF_DV_3A, 0x2E200400) + // C7.2.349 UHADD + // uhadd Vd,Vn,Vm DV_3A 0Q101110XX1mmmmm 000001nnnnnddddd 2E20 0400 Vd,Vn,Vm (vector) + +INST1(uhsub, "uhsub", 0, 0, IF_DV_3A, 0x2E202400) + // C7.2.350 UHSUB + // uhsub Vd,Vn,Vm DV_3A 0Q101110XX1mmmmm 001001nnnnnddddd 2E20 2400 Vd,Vn,Vm (vector) + +INST1(urhadd, "urhadd", 0, 0, IF_DV_3A, 0x2E201400) + // C7.2.373 URHADD + // urhadd Vd,Vn,Vm DV_3A 0Q101110XX1mmmmm 000101nnnnnddddd 2E20 1400 Vd,Vn,Vm (vector) + +INST1(usubl, "usubl", 0, 0, IF_DV_3H, 0x2E202000) + // C7.2.383 USUBL, USUBL2 + // usubl Vd,Vn,Vm DV_3H 00101110XX1mmmmm 001000nnnnnddddd 2E20 2000 Vd,Vn,Vm (vector) + +INST1(usubl2, "usubl2", 0, 0, IF_DV_3H, 0x6E202000) + // C7.2.383 USUBL, USUBL2 + // usubl2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 001000nnnnnddddd 6E20 2000 Vd,Vn,Vm (vector) + +INST1(usubw, "usubw", 0, 0, IF_DV_3H, 0x2E203000) + // C7.2.384 USUBW, USUBW2 + // usubw Vd,Vn,Vm DV_3H 00101110XX1mmmmm 001100nnnnnddddd 2E20 3000 Vd,Vn,Vm (vector) + +INST1(usubw2, "usubw2", 0, 0, IF_DV_3H, 0x6E203000) + // C7.2.384 USUBW, USUBW2 + // usubw2 Vd,Vn,Vm DV_3H 01101110XX1mmmmm 001100nnnnnddddd 6E20 3000 Vd,Vn,Vm (vector) + INST1(shll, "shll", 0, 0, IF_DV_2M, 0x2F00A400) // shll Vd,Vn,imm DV_2M 0Q101110XX100001 001110nnnnnddddd 2E21 3800 Vd,Vn, {8/16/32}