diff --git a/src/coreclr/jit/codegenarm.cpp b/src/coreclr/jit/codegenarm.cpp index adbc4147d5e480..e37f24a59be1b0 100644 --- a/src/coreclr/jit/codegenarm.cpp +++ b/src/coreclr/jit/codegenarm.cpp @@ -1642,7 +1642,7 @@ void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, ); } - regSet.verifyRegistersUsed(RBM_CALLEE_TRASH); + regSet.verifyRegistersUsed(RBM_CALLEE_TRASH(compiler)); } #ifdef PROFILING_SUPPORTED @@ -1909,7 +1909,7 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni void CodeGen::genPushFltRegs(regMaskTP regMask) { assert(regMask != 0); // Don't call uness we have some registers to push - assert((regMask & RBM_ALLFLOAT) == regMask); // Only floasting point registers should be in regMask + assert((regMask & RBM_ALLFLOAT(compiler)) == regMask); // Only floasting point registers should be in regMask regNumber lowReg = genRegNumFromMask(genFindLowestBit(regMask)); int slots = genCountBits(regMask); @@ -1928,7 +1928,7 @@ void CodeGen::genPushFltRegs(regMaskTP regMask) void CodeGen::genPopFltRegs(regMaskTP regMask) { assert(regMask != 0); // Don't call uness we have some registers to pop - assert((regMask & RBM_ALLFLOAT) == regMask); // Only floasting point registers should be in regMask + assert((regMask & RBM_ALLFLOAT(compiler)) == regMask); // Only floasting point registers should be in regMask regNumber lowReg = genRegNumFromMask(genFindLowestBit(regMask)); int slots = genCountBits(regMask); @@ -2135,7 +2135,7 @@ void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog) assert(compiler->compGeneratingEpilog); regMaskTP maskPopRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; - regMaskTP maskPopRegsFloat = maskPopRegs & RBM_ALLFLOAT; + regMaskTP maskPopRegsFloat = maskPopRegs & RBM_ALLFLOAT(compiler); regMaskTP maskPopRegsInt = maskPopRegs & ~maskPopRegsFloat; // First, pop float registers @@ -2295,7 +2295,7 @@ void CodeGen::genFuncletProlog(BasicBlock* block) compiler->unwindBegProlog(); - regMaskTP maskPushRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; + regMaskTP maskPushRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT(compiler); regMaskTP maskPushRegsInt = genFuncletInfo.fiSaveRegs & ~maskPushRegsFloat; regMaskTP maskStackAlloc = genStackAllocRegisterMask(genFuncletInfo.fiSpDelta, maskPushRegsFloat); @@ -2391,7 +2391,7 @@ void CodeGen::genFuncletEpilog() /* The saved regs info saves the LR register. We need to pop the PC register to return */ assert(genFuncletInfo.fiSaveRegs & RBM_LR); - regMaskTP maskPopRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; + regMaskTP maskPopRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT(compiler); regMaskTP maskPopRegsInt = genFuncletInfo.fiSaveRegs & ~maskPopRegsFloat; regMaskTP maskStackAlloc = genStackAllocRegisterMask(genFuncletInfo.fiSpDelta, maskPopRegsFloat); diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 3611924ff7b2b9..5e3c889975cf2d 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -910,7 +910,7 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe // Save integer registers at higher addresses than floating-point registers. - regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT; + regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT(compiler); regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat; if (maskSaveRegsFloat != RBM_NONE) @@ -1027,7 +1027,7 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in // Save integer registers at higher addresses than floating-point registers. - regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT; + regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT(compiler); regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat; // Restore in the opposite order of saving. @@ -1352,7 +1352,7 @@ void CodeGen::genFuncletProlog(BasicBlock* block) compiler->unwindBegProlog(); - regMaskTP maskSaveRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; + regMaskTP maskSaveRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT(compiler); regMaskTP maskSaveRegsInt = genFuncletInfo.fiSaveRegs & ~maskSaveRegsFloat; // Funclets must always save LR and FP, since when we have funclets we must have an FP frame. @@ -1556,7 +1556,7 @@ void CodeGen::genFuncletEpilog() unwindStarted = true; } - regMaskTP maskRestoreRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; + regMaskTP maskRestoreRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT(compiler); regMaskTP maskRestoreRegsInt = genFuncletInfo.fiSaveRegs & ~maskRestoreRegsFloat; // Funclets must always save LR and FP, since when we have funclets we must have an FP frame. @@ -5256,7 +5256,7 @@ void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode) { // Note that we cannot use targetReg before consuming all float source operands. // Therefore use an internal temp register - vectorReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + vectorReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT(compiler)); } // We will first consume the list items in execution (left to right) order, @@ -5655,7 +5655,7 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, 0, EA_UNKNOWN); - if ((genRegMask(initReg) & RBM_PROFILER_ENTER_TRASH) != RBM_NONE) + if ((genRegMask(initReg) & RBM_PROFILER_ENTER_TRASH(compiler)) != RBM_NONE) { *pInitRegZeroed = false; } diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 9fa35607307197..760bbd3117a059 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2634,7 +2634,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) if (shouldUse16ByteWideInstrs) { - const regNumber simdReg = node->GetSingleTempReg(RBM_ALLFLOAT); + const regNumber simdReg = node->GetSingleTempReg(RBM_ALLFLOAT(compiler)); const int initValue = (src->AsIntCon()->IconValue() & 0xFF); emit->emitIns_R_I(INS_movi, EA_16BYTE, simdReg, initValue, INS_OPTS_16B); @@ -2960,8 +2960,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (shouldUse16ByteWideInstrs) { - const regNumber simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT); - const regNumber simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT); + const regNumber simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT(compiler)); + const regNumber simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT(compiler)); helper.Unroll(FP_REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); } @@ -3190,7 +3190,7 @@ void CodeGen::genCall(GenTreeCall* call) // We should not have GC pointers in killed registers live around the call. // GC info for arg registers were cleared when consuming arg nodes above // and LSRA should ensure it for other trashed registers. - regMaskTP killMask = RBM_CALLEE_TRASH; + regMaskTP killMask = RBM_CALLEE_TRASH(compiler); if (call->IsHelperCall()) { CorInfoHelpFunc helpFunc = compiler->eeGetHelperNum(call->gtCallMethHnd); @@ -4693,7 +4693,7 @@ void CodeGen::genPushCalleeSavedRegisters() #endif // DEBUG #if defined(TARGET_ARM) - regMaskTP maskPushRegsFloat = rsPushRegs & RBM_ALLFLOAT; + regMaskTP maskPushRegsFloat = rsPushRegs & RBM_ALLFLOAT(compiler); regMaskTP maskPushRegsInt = rsPushRegs & ~maskPushRegsFloat; maskPushRegsInt |= genStackAllocRegisterMask(compiler->compLclFrameSize, maskPushRegsFloat); @@ -4800,7 +4800,7 @@ void CodeGen::genPushCalleeSavedRegisters() int offset; // This will be the starting place for saving the callee-saved registers, in increasing order. - regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT; + regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT(compiler); regMaskTP maskSaveRegsInt = rsPushRegs & ~maskSaveRegsFloat; #ifdef DEBUG diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 1601d9fd566c43..467d7bff5b2753 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -590,19 +590,19 @@ regMaskTP Compiler::compHelperCallKillSet(CorInfoHelpFunc helper) { case CORINFO_HELP_ASSIGN_REF: case CORINFO_HELP_CHECKED_ASSIGN_REF: - return RBM_CALLEE_TRASH_WRITEBARRIER; + return RBM_CALLEE_TRASH_WRITEBARRIER(this); case CORINFO_HELP_ASSIGN_BYREF: - return RBM_CALLEE_TRASH_WRITEBARRIER_BYREF; + return RBM_CALLEE_TRASH_WRITEBARRIER_BYREF(this); case CORINFO_HELP_PROF_FCN_ENTER: - return RBM_PROFILER_ENTER_TRASH; + return RBM_PROFILER_ENTER_TRASH(this); case CORINFO_HELP_PROF_FCN_LEAVE: - return RBM_PROFILER_LEAVE_TRASH; + return RBM_PROFILER_LEAVE_TRASH(this); case CORINFO_HELP_PROF_FCN_TAILCALL: - return RBM_PROFILER_TAILCALL_TRASH; + return RBM_PROFILER_TAILCALL_TRASH(this); #ifdef TARGET_X86 case CORINFO_HELP_ASSIGN_REF_EAX: @@ -622,16 +622,16 @@ regMaskTP Compiler::compHelperCallKillSet(CorInfoHelpFunc helper) #endif case CORINFO_HELP_STOP_FOR_GC: - return RBM_STOP_FOR_GC_TRASH; + return RBM_STOP_FOR_GC_TRASH(this); case CORINFO_HELP_INIT_PINVOKE_FRAME: - return RBM_INIT_PINVOKE_FRAME_TRASH; + return RBM_INIT_PINVOKE_FRAME_TRASH(this); case CORINFO_HELP_VALIDATE_INDIRECT_CALL: return RBM_VALIDATE_INDIRECT_CALL_TRASH; default: - return RBM_CALLEE_TRASH; + return RBM_CALLEE_TRASH(this); } } @@ -3590,18 +3590,18 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, bool* pXtraRegClobbere regMaskTP fpAvailMask; - fpAvailMask = RBM_FLT_CALLEE_TRASH & ~regArgMaskLive; + fpAvailMask = RBM_FLT_CALLEE_TRASH(compiler) & ~regArgMaskLive; if (GlobalJitOptions::compFeatureHfa) { - fpAvailMask &= RBM_ALLDOUBLE; + fpAvailMask &= RBM_ALLDOUBLE(compiler); } if (fpAvailMask == RBM_NONE) { - fpAvailMask = RBM_ALLFLOAT & ~regArgMaskLive; + fpAvailMask = RBM_ALLFLOAT(compiler) & ~regArgMaskLive; if (GlobalJitOptions::compFeatureHfa) { - fpAvailMask &= RBM_ALLDOUBLE; + fpAvailMask &= RBM_ALLDOUBLE(compiler); } } @@ -5246,7 +5246,7 @@ void CodeGen::genFinalizeFrame() // We always save FP. noway_assert(isFramePointerUsed()); #if defined(TARGET_AMD64) || defined(TARGET_ARM64) - regMaskTP okRegs = (RBM_CALLEE_TRASH | RBM_FPBASE | RBM_ENC_CALLEE_SAVED); + regMaskTP okRegs = (RBM_CALLEE_TRASH(compiler) | RBM_FPBASE | RBM_ENC_CALLEE_SAVED); if (RBM_ENC_CALLEE_SAVED != 0) { regSet.rsSetRegsModified(RBM_ENC_CALLEE_SAVED); @@ -5304,7 +5304,7 @@ void CodeGen::genFinalizeFrame() #if defined(TARGET_ARM) // TODO-ARM64-Bug?: enable some variant of this for FP on ARM64? - regMaskTP maskPushRegsFloat = maskCalleeRegsPushed & RBM_ALLFLOAT; + regMaskTP maskPushRegsFloat = maskCalleeRegsPushed & RBM_ALLFLOAT(compiler); regMaskTP maskPushRegsInt = maskCalleeRegsPushed & ~maskPushRegsFloat; if ((maskPushRegsFloat != RBM_NONE) || diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index b2c29c24c7939c..7368018b47675c 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -2332,8 +2332,8 @@ void CodeGen::genEmitCallIndir(int callType, // These should have been put in volatile registers to ensure they do not // get overridden by epilog sequence during tailcall. - noway_assert(!isJump || (iReg == REG_NA) || ((RBM_CALLEE_TRASH & genRegMask(iReg)) != 0)); - noway_assert(!isJump || (xReg == REG_NA) || ((RBM_CALLEE_TRASH & genRegMask(xReg)) != 0)); + noway_assert(!isJump || (iReg == REG_NA) || ((RBM_CALLEE_TRASH(compiler) & genRegMask(iReg)) != 0)); + noway_assert(!isJump || (xReg == REG_NA) || ((RBM_CALLEE_TRASH(compiler) & genRegMask(xReg)) != 0)); GetEmitter()->emitIns_Call(emitter::EmitCallType(callType), methHnd, diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 30a86945145e19..47ac8a757320e2 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2869,7 +2869,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) if (willUseSimdMov) { - regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); + regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT(compiler)); unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX) ? YMM_REGSIZE_BYTES @@ -3137,7 +3137,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (size >= XMM_REGSIZE_BYTES) { - regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT); + regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT(compiler)); instruction simdMov = simdUnalignedMovIns(); @@ -3459,7 +3459,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode) if (loadSize >= XMM_REGSIZE_BYTES) #endif { - xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT); + xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT(compiler)); } if ((loadSize % XMM_REGSIZE_BYTES) != 0) { @@ -5655,7 +5655,7 @@ void CodeGen::genCall(GenTreeCall* call) // We should not have GC pointers in killed registers live around the call. // GC info for arg registers were cleared when consuming arg nodes above // and LSRA should ensure it for other trashed registers. - regMaskTP killMask = RBM_CALLEE_TRASH; + regMaskTP killMask = RBM_CALLEE_TRASH(compiler); if (call->IsHelperCall()) { CorInfoHelpFunc helpFunc = compiler->eeGetHelperNum(call->gtCallMethHnd); @@ -7910,13 +7910,13 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk) intTmpReg = putArgStk->GetSingleTempReg(RBM_ALLINT); assert(genIsValidIntReg(intTmpReg)); } - if ((rsvdRegs & RBM_ALLFLOAT) != 0) + if ((rsvdRegs & RBM_ALLFLOAT(compiler)) != 0) { - simdTmpReg = putArgStk->GetSingleTempReg(RBM_ALLFLOAT); + simdTmpReg = putArgStk->GetSingleTempReg(RBM_ALLFLOAT(compiler)); assert(genIsValidFloatReg(simdTmpReg)); } assert(genCountBits(rsvdRegs) == (unsigned)((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1)); - } + } for (GenTreeFieldList::Use& use : fieldList->Uses()) { @@ -9100,7 +9100,7 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) } // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using. - if ((RBM_CALLEE_TRASH & genRegMask(initReg)) != 0) + if ((RBM_CALLEE_TRASH(compiler) & genRegMask(initReg)) != 0) { *pInitRegZeroed = false; } @@ -9137,7 +9137,7 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, 0, EA_UNKNOWN, REG_DEFAULT_PROFILER_CALL_TARGET); // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using. - if ((RBM_CALLEE_TRASH & genRegMask(initReg)) != 0) + if ((RBM_CALLEE_TRASH(compiler) & genRegMask(initReg)) != 0) { *pInitRegZeroed = false; } @@ -9178,7 +9178,7 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper) if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaGetDesc(compiler->info.compThisArg)->lvIsInReg()) { regMaskTP thisPtrMask = genRegMask(compiler->lvaGetDesc(compiler->info.compThisArg)->GetRegNum()); - noway_assert((RBM_PROFILER_LEAVE_TRASH & thisPtrMask) == 0); + noway_assert((RBM_PROFILER_LEAVE_TRASH(compiler) & thisPtrMask) == 0); } // At this point return value is computed and stored in RAX or XMM0. diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 2c5dd165e104f9..aff264c40dfeaa 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9055,6 +9055,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // Returns: // `true` if user requests EVEX encoding and it's safe, `false` if not. // +public: bool DoJitStressEvexEncoding() const { #if defined(TARGET_XARCH) && defined(DEBUG) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 771882b6190d69..0f792fcdd3fad8 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -3218,7 +3218,7 @@ emitter::instrDesc* emitter::emitNewInstrCallInd(int argCnt, { emitAttr retSize = (retSizeIn != EA_UNKNOWN) ? retSizeIn : EA_PTRSIZE; - bool gcRefRegsInScratch = ((gcrefRegs & RBM_CALLEE_TRASH) != 0); + bool gcRefRegsInScratch = ((gcrefRegs & RBM_CALLEE_TRASH(emitComp)) != 0); // Allocate a larger descriptor if any GC values need to be saved // or if we have an absurd number of arguments or a large address @@ -3308,7 +3308,7 @@ emitter::instrDesc* emitter::emitNewInstrCallDir(int argCnt, // call returns a two-register-returned struct and the second // register (RDX) is a GCRef or ByRef pointer. - bool gcRefRegsInScratch = ((gcrefRegs & RBM_CALLEE_TRASH) != 0); + bool gcRefRegsInScratch = ((gcrefRegs & RBM_CALLEE_TRASH(emitComp)) != 0); if (!VarSetOps::IsEmpty(emitComp, GCvars) || // any frame GCvars live gcRefRegsInScratch || // any register gc refs live in scratch regs @@ -9724,35 +9724,35 @@ regMaskTP emitter::emitGetGCRegsKilledByNoGCCall(CorInfoHelpFunc helper) { case CORINFO_HELP_ASSIGN_REF: case CORINFO_HELP_CHECKED_ASSIGN_REF: - result = RBM_CALLEE_GCTRASH_WRITEBARRIER; + result = RBM_CALLEE_GCTRASH_WRITEBARRIER(emitComp); break; case CORINFO_HELP_ASSIGN_BYREF: - result = RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF; + result = RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF(emitComp); break; #if !defined(TARGET_LOONGARCH64) case CORINFO_HELP_PROF_FCN_ENTER: - result = RBM_PROFILER_ENTER_TRASH; + result = RBM_PROFILER_ENTER_TRASH(emitComp); break; case CORINFO_HELP_PROF_FCN_LEAVE: #if defined(TARGET_ARM) // profiler scratch remains gc live - result = RBM_PROFILER_LEAVE_TRASH & ~RBM_PROFILER_RET_SCRATCH; + result = RBM_PROFILER_LEAVE_TRASH(emitComp) & ~RBM_PROFILER_RET_SCRATCH; #else - result = RBM_PROFILER_LEAVE_TRASH; + result = RBM_PROFILER_LEAVE_TRASH(emitComp); #endif break; case CORINFO_HELP_PROF_FCN_TAILCALL: - result = RBM_PROFILER_TAILCALL_TRASH; + result = RBM_PROFILER_TAILCALL_TRASH(emitComp); break; #endif // !defined(TARGET_LOONGARCH64) #if defined(TARGET_X86) case CORINFO_HELP_INIT_PINVOKE_FRAME: - result = RBM_INIT_PINVOKE_FRAME_TRASH; + result = RBM_INIT_PINVOKE_FRAME_TRASH(c); break; #endif // defined(TARGET_X86) @@ -9761,7 +9761,7 @@ regMaskTP emitter::emitGetGCRegsKilledByNoGCCall(CorInfoHelpFunc helper) break; default: - result = RBM_CALLEE_TRASH_NOGC; + result = RBM_CALLEE_TRASH_NOGC(emitComp); break; } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 4b7af2f73e31dc..9ac5ed4439e05b 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1121,6 +1121,28 @@ class emitter idAddr()->_idReg4 = reg; assert(reg == idAddr()->_idReg4); } + bool idHasReg3() const + { + switch (idInsFmt()) + { + case IF_RWR_RRD_RRD: + case IF_RWR_RRD_RRD_CNS: + case IF_RWR_RRD_RRD_RRD: + return true; + default: + return false; + } + } + bool idHasReg4() const + { + switch (idInsFmt()) + { + case IF_RWR_RRD_RRD_RRD: + return true; + default: + return false; + } + } #endif // defined(TARGET_XARCH) #ifdef TARGET_ARMARCH insOpts idInsOpt() const diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 82c78299efebd3..9fdeac741d2913 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -214,7 +214,7 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) /*static*/ inline void emitter::emitEncodeCallGCregs(regMaskTP regmask, instrDesc* id) { - assert((regmask & RBM_CALLEE_TRASH) == 0); + //assert((regmask & RBM_CALLEE_TRASH(emitComp)) == 0); unsigned encodeMask; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 27ad0e287d7205..0752c4960367ec 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -217,10 +217,10 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_phminposuw: case INS_mpsadbw: case INS_pclmulqdq: - case INS_aesdec: - case INS_aesdeclast: case INS_aesenc: case INS_aesenclast: + case INS_aesdec: + case INS_aesdeclast: case INS_aesimc: case INS_aeskeygenassist: case INS_vzeroupper: @@ -260,18 +260,24 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_prefetcht2: case INS_sfence: // Might need new INS_*suffix* instructions for these. - case INS_por: // INS_pord, INS_porq. - case INS_pxor: // INS_pxord, INS_pxorq - case INS_movdqa: // INS_movdqa32, INS_movdqa64. - case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64. - case INS_pand: // INS_pandd, INS_pandq. - case INS_pandn: // INS_pandnd, INS_pandnq. - case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2. - case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2. - case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2. - case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2. case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2. case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2. + + // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, movdqu16 etc) + // For implementation speed, I have set it up so the standing instruction will default to the 32-bit operand type + // i.e., movdqu => movdqu32 etc + // Since we are not using k registers yet, this will have no impact on correctness but will affect things once + // k registers are used (as that is the point of the "break out operand type" of these instructions) + //case INS_movdqa: // INS_movdqa32, INS_movdqa64. + //case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64. + //case INS_pand: // INS_pandd, INS_pandq. + //case INS_pandn: // INS_pandnd, INS_pandnq. + //case INS_por: // INS_pord, INS_porq. + //case INS_pxor: // INS_pxord, INS_pxorq + //case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2. + //case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2. + //case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2. + //case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2. { return false; } @@ -761,9 +767,11 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) // Return Value: // true if this instruction requires a VEX or EVEX prefix. // -bool emitter::TakesSimdPrefix(instruction ins) const +bool emitter::TakesSimdPrefix(const instrDesc *id) const { - return TakesEvexPrefix(ins) || TakesVexPrefix(ins); + instruction ins = id->idIns(); + + return TakesEvexPrefix(id) || TakesVexPrefix(ins); } //------------------------------------------------------------------------ @@ -785,13 +793,23 @@ bool emitter::TakesSimdPrefix(instruction ins) const // Return Value: // true if this instruction requires a EVEX prefix. // -bool emitter::TakesEvexPrefix(instruction ins) const +bool emitter::TakesEvexPrefix(const instrDesc *id) const { if (!emitComp->DoJitStressEvexEncoding()) { return false; } + instruction ins = id->idIns(); + + if (HasHighSIMDReg(id)) + { + assert(IsEvexEncodedInstruction(ins)); + // TODO-XARCH-AVX512 remove this check once k registers have been implemented + assert(!HasKMaskRegisterDest(ins)); + return true; + } + // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added. return IsEvexEncodedInstruction(ins) && !HasKMaskRegisterDest(ins); } @@ -1059,6 +1077,35 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) #endif //! TARGET_AMD64 } +// Returns true if using this register will require an EVEX.R', EVEX.V' or EVEX.X bit. +bool emitter::HasHighSIMDReg(const instrDesc *id) const +{ +#if defined(TARGET_AMD64) + if (IsHighSIMDReg(id->idReg1()) || IsHighSIMDReg(id->idReg2())) + return true; + + if (id->idIsSmallDsc()) + return false; + + if ((id->idHasReg3() && IsHighSIMDReg(id->idReg3())) || + (id->idHasReg4() && IsHighSIMDReg(id->idReg4()))) + return true; +#endif + // X86 JIT operates in 32-bit mode and hence extended reg are not available. + return false; +} + +// Returns true if using this register will require an EVEX.R', EVEX.V' or EVEX.X bit. +bool emitter::IsHighSIMDReg(regNumber reg) const +{ +#ifdef TARGET_AMD64 + return ((reg >= REG_XMM16) && (reg <= REG_XMM31)); +#else + // X86 JIT operates in 32-bit mode and hence extended reg are not available. + return false; +#endif +} + // Returns true if using this register will require a REX.* prefix. // Since XMM registers overlap with YMM registers, this routine // can also be used to know whether a YMM register if the @@ -1066,7 +1113,7 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) bool IsExtendedReg(regNumber reg) { #ifdef TARGET_AMD64 - return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM15)); + return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM31)); #else // X86 JIT operates in 32-bit mode and hence extended reg are not available. return false; @@ -1078,7 +1125,7 @@ bool IsExtendedReg(regNumber reg, emitAttr attr) { #ifdef TARGET_AMD64 // Not a register, so doesn't need a prefix - if (reg > REG_XMM15) + if (reg > REG_XMM31) { return false; } @@ -1119,12 +1166,19 @@ bool IsExtendedReg(regNumber reg, emitAttr attr) bool IsXMMReg(regNumber reg) { #ifdef TARGET_AMD64 - return (reg >= REG_XMM0) && (reg <= REG_XMM15); + return (reg >= REG_XMM0) && (reg <= REG_XMM31); #else // !TARGET_AMD64 return (reg >= REG_XMM0) && (reg <= REG_XMM7); #endif // !TARGET_AMD64 } +// Returns bits to be encoded in instruction for the given register +unsigned HighAwareRegEncoding(regNumber reg) +{ + static_assert((REG_XMM0 & 0x7) == 0, "bad XMMBASE"); + return (unsigned)(reg & 0xF); +} + // Returns bits to be encoded in instruction for the given register. unsigned RegEncoding(regNumber reg) { @@ -1135,11 +1189,13 @@ unsigned RegEncoding(regNumber reg) // Utility routines that abstract the logic of adding REX.W, REX.R, REX.X, REX.B and REX prefixes // SSE2: separate 1-byte prefix gets added before opcode. // AVX: specific bits within VEX prefix need to be set in bit-inverted form. -emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexWPrefix(const instrDesc *id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // W-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1169,11 +1225,13 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) #ifdef TARGET_AMD64 -emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexRPrefix(const instrDesc *id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // R-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1197,11 +1255,13 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) return code | 0x4400000000ULL; } -emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexXPrefix(const instrDesc *id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { // X-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1224,11 +1284,13 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) return code | 0x4200000000ULL; } -emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexBPrefix(const instrDesc *id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // B-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1260,6 +1322,19 @@ emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) return code | 0x4000000000ULL; } +emitter::code_t emitter::AddEvexVPrimePrefix(code_t code) +{ + assert(UseEvexEncoding() && hasEvexPrefix(code)); + return emitter::code_t(code & 0xFFFFFFF7FFFFFFFFULL); +} + +emitter::code_t emitter::AddEvexRPrimePrefix(code_t code) +{ + assert(UseEvexEncoding() && hasEvexPrefix(code)); + return emitter::code_t(code & 0xFFEFFFFFFFFFFFFFULL); +} + + #endif // TARGET_AMD64 bool isPrefix(BYTE b) @@ -1962,15 +2037,18 @@ unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr) // Returns: // Updated size. // -unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, code_t code) +// TODO-XARCH-AVX512 come back and check whether we can id `id` directly (no need) +// to pass emitAttr size +unsigned emitter::emitGetAdjustedSizeEvexAware(const instrDesc *id, emitAttr attr, code_t code) { unsigned adjustedSize = 0; + instruction ins = id->idIns(); // TODO-XArch-AVX512: Remove redundant code and possiblly collapse EVEX and VEX into a single pathway // IsEvexEncodedInstruction(ins) is `true` for AVX/SSE instructions also which needs to be VEX encoded unless // explicitly // asked for EVEX. - if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(ins)) + if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(id)) { // EVEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. // Therefore, to estimate the size adding EVEX prefix size and size of instruction opcode bytes will always @@ -2669,10 +2747,12 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) * part of an opcode. */ -inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code) +inline unsigned emitter::insEncodeReg012(const instrDesc *id, regNumber reg, emitAttr size, code_t* code) { assert(reg < REG_STK); + instruction ins = id->idIns(); + #ifdef TARGET_AMD64 // Either code is not NULL or reg is not an extended reg. // If reg is an extended reg, instruction needs to be prefixed with 'REX' @@ -2681,7 +2761,14 @@ inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAtt if (IsExtendedReg(reg)) { - *code = AddRexBPrefix(ins, *code); // REX.B + if (IsHighSIMDReg(reg)) + { + *code = AddRexXPrefix(id, *code); // EVEX.X + } + if (reg & 0x8) + { + *code = AddRexBPrefix(id, *code); // REX.B + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { @@ -2703,10 +2790,12 @@ inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAtt * part of an opcode. */ -inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code) +inline unsigned emitter::insEncodeReg345(const instrDesc *id, regNumber reg, emitAttr size, code_t* code) { assert(reg < REG_STK); + instruction ins = id->idIns(); + #ifdef TARGET_AMD64 // Either code is not NULL or reg is not an extended reg. // If reg is an extended reg, instruction needs to be prefixed with 'REX' @@ -2715,7 +2804,14 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt if (IsExtendedReg(reg)) { - *code = AddRexRPrefix(ins, *code); // REX.R + if (IsHighSIMDReg(reg)) + { + *code = AddEvexRPrimePrefix(*code); // EVEX.R' + } + if (reg & 0x8) + { + *code = AddRexRPrefix(id, *code); // REX.R + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { @@ -2736,8 +2832,10 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt * Returns modified SIMD opcode with the specified register encoded in bits 3-6 of * byte 2 of VEX and EVEX prefix. */ -inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeReg3456(const instrDesc *id, regNumber reg, emitAttr size, code_t code) { + instruction ins = id->idIns(); + assert(reg < REG_STK); assert(IsVexOrEvexEncodedInstruction(ins)); assert(hasVexOrEvexPrefix(code)); @@ -2755,10 +2853,20 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, assert(regBits <= 0xF); if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) { - assert(hasEvexPrefix(code) && TakesEvexPrefix(ins)); + assert(hasEvexPrefix(code) && TakesEvexPrefix(id)); + // TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case. + // Rather see these paths cleaned up. + regBits = HighAwareRegEncoding(reg); +#if defined(TARGET_AMD64) + if (IsHighSIMDReg(reg)) + { + // Have to set the EVEX V' bit + code = AddEvexVPrimePrefix(code); + } +#endif // Shift count = 5-bytes of opcode + 0-2 bits for EVEX regBits <<= 43; return code ^ regBits; @@ -2766,6 +2874,11 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, } if (UseVEXEncoding() && IsVexEncodedInstruction(ins)) { + + + // Both prefix encodes register operand in 1's complement form + assert(regBits <= 0xF); + if (TakesVexPrefix(ins)) { assert(hasVexPrefix(code)); @@ -2785,8 +2898,10 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, * Used exclusively to generate the REX.X bit and truncate the register. */ -inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* code) +inline unsigned emitter::insEncodeRegSIB(const instrDesc *id, regNumber reg, code_t* code) { + instruction ins = id->idIns(); + assert(reg < REG_STK); #ifdef TARGET_AMD64 @@ -2797,7 +2912,14 @@ inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* if (IsExtendedReg(reg)) { - *code = AddRexXPrefix(ins, *code); // REX.X + if (IsHighSIMDReg(reg)) + { + *code = AddEvexVPrimePrefix(*code); // EVEX.X + } + if (reg & 0x8) + { + *code = AddRexXPrefix(id, *code); // REX.B + } } unsigned regBits = RegEncoding(reg); #else // !TARGET_AMD64 @@ -2813,7 +2935,7 @@ inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* * Returns the "[r/m]" opcode with the mod/RM field set to register. */ -inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code) +inline emitter::code_t emitter::insEncodeMRreg(const instrDesc *id, code_t code) { // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. // Otherwise, it will be placed after the 4 byte encoding. @@ -2831,7 +2953,7 @@ inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code) * Returns the given "[r/m]" opcode with the mod/RM field set to register. */ -inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code) +inline emitter::code_t emitter::insEncodeRMreg(const instrDesc *id, code_t code) { // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. // Otherwise, it will be placed after the 4 byte encoding. @@ -2849,11 +2971,11 @@ inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code) * the given register. */ -inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeMRreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code) { assert((code & 0xC000) == 0); code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + unsigned regcode = insEncodeReg012(id, reg, size, &code) << 8; code |= regcode; return code; } @@ -2864,11 +2986,11 @@ inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, e * the given register. */ -inline emitter::code_t emitter::insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeMIreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code) { assert((code & 0xC000) == 0); code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + unsigned regcode = insEncodeReg012(id, reg, size, &code) << 8; code |= regcode; return code; } @@ -2889,13 +3011,13 @@ inline bool insNeedsRRIb(instruction ins) * Returns the "reg,reg,imm8" opcode with both the reg's set to the * the given register. */ -inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, emitAttr size) +inline emitter::code_t emitter::insEncodeRRIb(const instrDesc *id, regNumber reg, emitAttr size) { assert(size == EA_4BYTE); // All we handle for now. - assert(insNeedsRRIb(ins)); + assert(insNeedsRRIb(id->idIns())); // If this list gets longer, use a switch, or a table lookup. code_t code = 0x69c0; - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // We use the same register as source and destination. (Could have another version that does both regs...) code |= regcode; code |= (regcode << 3); @@ -2908,10 +3030,10 @@ inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, em * nibble of the opcode */ -inline emitter::code_t emitter::insEncodeOpreg(instruction ins, regNumber reg, emitAttr size) +inline emitter::code_t emitter::insEncodeOpreg(const instrDesc *id, regNumber reg, emitAttr size) { - code_t code = insCodeRR(ins); - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + code_t code = insCodeRR(id->idIns()); + unsigned regcode = insEncodeReg012(id, reg, size, &code); code |= regcode; return code; } @@ -3082,7 +3204,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code) instruction ins = id->idIns(); emitAttr attr = id->idOpSize(); - UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(ins, attr, code); + UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(id, attr, code); bool includeRexPrefixSize = true; // REX prefix @@ -3141,8 +3263,10 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code, int val return valSize + emitInsSizeRR(id, code); } -inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, regNumber reg2, emitAttr attr) +inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc *id, regNumber reg1, regNumber reg2, emitAttr attr) { + instruction ins = id->idIns(); + emitAttr size = EA_SIZE(attr); // If Byte 4 (which is 0xFF00) is zero, that's where the RM encoding goes. @@ -3150,7 +3274,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re // This would probably be better expressed as a different format or something? code_t code = insCodeRM(ins); - UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(ins, size, insCodeRM(ins)); + UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(id, size, insCodeRM(ins)); bool includeRexPrefixSize = true; // REX prefix @@ -3170,7 +3294,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, re } else { - sz += emitInsSize(insEncodeRMreg(ins, code), includeRexPrefixSize); + sz += emitInsSize(insEncodeRMreg(id, code), includeRexPrefixSize); } return sz; @@ -3299,7 +3423,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); // Check whether we can use compressed displacement if EVEX. - if (TakesEvexPrefix(id->idIns())) + if (TakesEvexPrefix(id)) { bool compressedFitsInByte = false; TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); @@ -3343,7 +3467,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; - if (TakesEvexPrefix(id->idIns())) + if (TakesEvexPrefix(id)) { TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); } @@ -3372,7 +3496,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var assert(id->idIns() != INS_invalid); instruction ins = id->idIns(); emitAttr attrSize = id->idOpSize(); - UNATIVE_OFFSET prefix = emitGetAdjustedSizeEvexAware(ins, attrSize, code); + UNATIVE_OFFSET prefix = emitGetAdjustedSizeEvexAware(id, attrSize, code); // REX prefix if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || @@ -3390,7 +3514,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var instruction ins = id->idIns(); emitAttr attrSize = id->idOpSize(); UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(attrSize); - UNATIVE_OFFSET prefix = emitGetAdjustedSizeEvexAware(ins, attrSize, code); + UNATIVE_OFFSET prefix = emitGetAdjustedSizeEvexAware(id, attrSize, code); bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); #ifdef TARGET_AMD64 @@ -3496,7 +3620,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) } else { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -3523,7 +3647,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) size = 2; } - size += emitGetAdjustedSizeEvexAware(ins, attrSize, code); + size += emitGetAdjustedSizeEvexAware(id, attrSize, code); if (hasRexPrefix(code)) { @@ -3741,7 +3865,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code) // can be reached via RIP-relative addressing. UNATIVE_OFFSET size = sizeof(INT32); - size += emitGetAdjustedSizeEvexAware(ins, attrSize, code); + size += emitGetAdjustedSizeEvexAware(id, attrSize, code); bool includeRexPrefixSize = true; @@ -3983,7 +4107,7 @@ void emitter::emitIns(instruction ins, emitAttr attr) insFormat fmt = IF_NONE; - sz += emitGetAdjustedSizeEvexAware(ins, attr, code); + sz += emitGetAdjustedSizeEvexAware(id, attr, code); if (TakesRexWPrefix(ins, attr)) { sz += emitGetRexPrefixSize(ins); @@ -5100,7 +5224,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) /* We expect this to always be a 'big' opcode */ - assert(insEncodeMRreg(ins, reg, attr, insCodeMR(ins)) & 0x00FF0000); + assert(insEncodeMRreg(id, reg, attr, insCodeMR(ins)) & 0x00FF0000); size = attr; @@ -5120,7 +5244,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) id->idReg1(reg); // Vex bytes - sz += emitGetAdjustedSizeEvexAware(ins, attr, insEncodeMRreg(ins, reg, attr, insCodeMR(ins))); + sz += emitGetAdjustedSizeEvexAware(id, attr, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); // REX byte if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr)) @@ -5262,7 +5386,12 @@ void emitter::emitIns_R_I(instruction ins, break; } - sz += emitGetAdjustedSizeEvexAware(ins, attr, insCodeMI(ins)); + id = emitNewInstrSC(attr, val); + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(reg); + + sz += emitGetAdjustedSizeEvexAware(id, attr, insCodeMI(ins)); // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a // 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target @@ -5272,10 +5401,6 @@ void emitter::emitIns_R_I(instruction ins, sz += emitGetRexPrefixSize(ins); } - id = emitNewInstrSC(attr, val); - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(reg); id->idCodeSize(sz); #ifdef DEBUG @@ -5854,13 +5979,13 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN return; } - UNATIVE_OFFSET sz = emitInsSizeRR(ins, dstReg, srcReg, attr); - instrDesc* id = emitNewInstrSmall(attr); id->idIns(ins); id->idInsFmt(fmt); id->idReg1(dstReg); id->idReg2(srcReg); + + UNATIVE_OFFSET sz = emitInsSizeRR(id, dstReg, srcReg, attr); id->idCodeSize(sz); dispIns(id); @@ -5885,8 +6010,6 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum assert(size <= EA_32BYTE); noway_assert(emitVerifyEncodable(ins, size, reg1, reg2)); - UNATIVE_OFFSET sz = emitInsSizeRR(ins, reg1, reg2, attr); - /* Special case: "XCHG" uses a different format */ insFormat fmt = (ins == INS_xchg) ? IF_RRW_RRW : emitInsModeFormat(ins, IF_RRD_RRD); @@ -5895,6 +6018,8 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum id->idInsFmt(fmt); id->idReg1(reg1); id->idReg2(reg2); + + UNATIVE_OFFSET sz = emitInsSizeRR(id, reg1, reg2, attr); id->idCodeSize(sz); dispIns(id); @@ -8965,7 +9090,7 @@ void emitter::emitIns_Call(EmitCallType callType, { // Tailcall with addressing mode/register needs to be rex.w // prefixed to be recognized as part of epilog by unwinder. - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } sz = emitInsSizeAM(id, code); @@ -11324,13 +11449,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { // tail call with addressing mode (or through register) needs rex.w // prefix to be recognized by unwinder as part of epilog. - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Special case: call via a register if (id->idIsCallRegPtr()) { - code = insEncodeMRreg(ins, reg, EA_PTRSIZE, code); + code = insEncodeMRreg(id, reg, EA_PTRSIZE, code); dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code); goto DONE; @@ -11344,14 +11469,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute the REX prefix if it exists if (IsExtendedReg(reg, EA_PTRSIZE)) { - insEncodeReg012(ins, reg, EA_PTRSIZE, &code); + insEncodeReg012(id, reg, EA_PTRSIZE, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. reg = (regNumber)RegEncoding(reg); } if (IsExtendedReg(rgx, EA_PTRSIZE)) { - insEncodeRegSIB(ins, rgx, &code); + insEncodeRegSIB(id, rgx, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. rgx = (regNumber)RegEncoding(rgx); } @@ -11397,7 +11522,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { case IF_RWR_ARD: - assert(code == (insCodeRM(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8))); + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); code &= ~((code_t)0xFFFFFFFF); code |= 0xA0; @@ -11406,7 +11531,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_AWR_RRD: - assert(code == (insCodeMR(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8))); + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); code &= ~((code_t)0xFFFFFFFF); code |= 0xA2; @@ -11423,10 +11548,10 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Emit SIMD prefix if required // There are some callers who already add SIMD prefix and call this routine. // Therefore, add SIMD prefix is one is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // For this format, moves do not support a third operand, so we only need to handle the binary ops. - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { if (IsDstDstSrcAVXInstruction(ins)) { @@ -11451,33 +11576,33 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, src1, size, code); + code = insEncodeReg3456(id, src1, size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { - code = insEncodeReg3456(ins, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg2(), size, code); } } // Emit the REX prefix if required // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. - // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently - // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doind so currently + // since we cannot differentiate EVEX vs VEX without 'code' untill all paths have EVEX support. + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } if (IsExtendedReg(reg, EA_PTRSIZE)) { - insEncodeReg012(ins, reg, EA_PTRSIZE, &code); + insEncodeReg012(id, reg, EA_PTRSIZE, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. reg = (regNumber)RegEncoding(reg); } if (IsExtendedReg(rgx, EA_PTRSIZE)) { - insEncodeRegSIB(ins, rgx, &code); + insEncodeRegSIB(id, rgx, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. rgx = (regNumber)RegEncoding(rgx); } @@ -11517,7 +11642,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } } } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -11647,7 +11772,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -11882,7 +12007,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr); + code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr); // Is there a displacement? if (dspIsZero) @@ -11912,7 +12037,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) << 8; + code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) << 8; // Is there a displacement? if (dspIsZero) @@ -11958,8 +12083,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (reg != REG_NA) { // The address is "[reg + {2/4/8} * rgx + icon]" - regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | - insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); + regByte = insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) | + insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -12025,8 +12150,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // The address is "[{2/4/8} * rgx + icon]" - regByte = insEncodeReg012(ins, REG_EBP, EA_PTRSIZE, nullptr) | - insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); + regByte = insEncodeReg012(id, REG_EBP, EA_PTRSIZE, nullptr) | + insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -12055,7 +12180,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // The address is "[reg+rgx+dsp]" - regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr); + regByte = insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -12284,16 +12409,16 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Add VEX or EVEX prefix if required. // There are some callers who already add prefix and call this routine. // Therefore, add VEX or EVEX prefix if one is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). // Not doing so currently since we cannot differentiate EVEX vs VEX without // 'code' until all paths have EVEX support. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Special case emitting AVX instructions @@ -12320,9 +12445,9 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -12453,7 +12578,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this int dspAsByte = dsp; - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -12507,7 +12632,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -12733,12 +12858,12 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute VEX/EVEX prefix // Some of its callers already add EVEX/VEX prefix and then call this routine. // Therefore add EVEX/VEX prefix is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // `addc` is used for two kinds if instructions @@ -12773,7 +12898,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { case IF_RWR_MRD: - assert(code == (insCodeRM(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); code &= ~((code_t)0xFFFFFFFF); code |= 0xA0; @@ -12782,7 +12907,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_MWR_RRD: - assert(code == (insCodeMR(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); code &= ~((code_t)0xFFFFFFFF); code |= 0xA2; @@ -12820,9 +12945,9 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -13205,13 +13330,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Register... - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // Output the REX prefix dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -13225,7 +13350,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); } - dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(ins, reg, size, nullptr)); + dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(id, reg, size, nullptr)); } break; @@ -13235,9 +13360,9 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) case INS_push_hide: assert(size == EA_PTRSIZE); - code = insEncodeOpreg(ins, reg, size); + code = insEncodeOpreg(id, reg, size); - assert(!TakesSimdPrefix(ins)); + assert(!TakesSimdPrefix(id)); assert(!TakesRexWPrefix(ins, size)); // Output the REX prefix @@ -13257,13 +13382,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = insCodeRR(ins); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Register... - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // Output the REX prefix dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -13292,7 +13417,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); assert(size == EA_1BYTE); - code = insEncodeMRreg(ins, reg, EA_1BYTE, insCodeMR(ins)); + code = insEncodeMRreg(id, reg, EA_1BYTE, insCodeMR(ins)); // Output the REX prefix dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -13318,7 +13443,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); - code = insEncodeMRreg(ins, reg, size, insCodeMR(ins)); + code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); if (size != EA_1BYTE) { @@ -13332,11 +13457,11 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) } } - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Output the REX prefix @@ -13423,36 +13548,36 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { code = insCodeMR(ins); } - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } else if ((ins == INS_movsx) || (ins == INS_movzx) || (insIsCMOV(ins))) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code) | (int)(size == EA_2BYTE); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code) | (int)(size == EA_2BYTE); #ifdef TARGET_AMD64 assert((size < EA_4BYTE) || (insIsCMOV(ins))); if ((size == EA_8BYTE) || (ins == INS_movsx)) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } else if (ins == INS_movsxd) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); #endif // TARGET_AMD64 } @@ -13462,8 +13587,8 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); if ((ins == INS_crc32) && (size > EA_1BYTE)) { code |= 0x0100; @@ -13476,15 +13601,15 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } else if (size == EA_8BYTE) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } #endif // FEATURE_HW_INTRINSICS else { - assert(!TakesSimdPrefix(ins)); + assert(!TakesSimdPrefix(id)); code = insCodeMR(ins); - code = insEncodeMRreg(ins, code); + code = insEncodeMRreg(id, code); if (ins != INS_test) { @@ -13514,7 +13639,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) // Don't need to zero out the high bits explicitly if ((ins != INS_xor) || (reg1 != reg2)) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } else { @@ -13551,10 +13676,10 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } } - unsigned regCode = insEncodeReg345(ins, regFor345Bits, size, &code); - regCode |= insEncodeReg012(ins, regFor012Bits, size, &code); + unsigned regCode = insEncodeReg345(id, regFor345Bits, size, &code); + regCode |= insEncodeReg012(id, regFor012Bits, size, &code); - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { // In case of AVX instructions that take 3 operands, we generally want to encode reg1 // as first source. In this case, reg1 is both a source and a destination. @@ -13566,12 +13691,12 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) if (IsDstDstSrcAVXInstruction(ins)) { // encode source/dest operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, reg1, size, code); + code = insEncodeReg3456(id, reg1, size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, reg2, size, code); + code = insEncodeReg3456(id, reg2, size, code); } } @@ -13813,21 +13938,21 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) emitAttr size = id->idOpSize(); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); - code = insEncodeRMreg(ins, code); + code = insEncodeRMreg(id, code); // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } - unsigned regCode = insEncodeReg345(ins, targetReg, size, &code); - regCode |= insEncodeReg012(ins, src2, size, &code); + unsigned regCode = insEncodeReg345(id, targetReg, size, &code); + regCode |= insEncodeReg012(id, src2, size, &code); // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, src1, size, code); + code = insEncodeReg3456(id, src1, size, code); // Output the REX/VEX/EVEX prefix dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -13921,17 +14046,17 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // Get the 'base' opcode. code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMIreg(ins, reg, size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMIreg(id, reg, size, code); assert(code & 0x00FF0000); - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { // The 'vvvv' bits encode the destination register, which for this case (RI) // is the same as the source. - code = insEncodeReg3456(ins, reg, size, code); + code = insEncodeReg3456(id, reg, size, code); } - unsigned regcode = (insEncodeReg345(ins, regOpcode, size, &code) | insEncodeReg012(ins, reg, size, &code)) << 8; + unsigned regcode = (insEncodeReg345(id, regOpcode, size, &code) | insEncodeReg012(id, reg, size, &code)) << 8; // Output the REX prefix dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -13959,15 +14084,15 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) assert(code < 0x100); code |= 0x08; // Set the 'w' bit - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); code |= regcode; // This is INS_mov and will not take VEX prefix assert(!TakesVexPrefix(ins)); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -14059,13 +14184,13 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // r/m, immed form, but do have a dstReg,srcReg,imm8 form. if (valInByte && useSigned && insNeedsRRIb(ins)) { - code = insEncodeRRIb(ins, reg, size); + code = insEncodeRRIb(id, reg, size); } else { code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMIreg(ins, reg, size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMIreg(id, reg, size, code); } } @@ -14089,7 +14214,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) /* Set the 'w' bit to get the large version */ /* and the REX.W bit to get the really large version */ - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); code |= 0x1; break; #endif @@ -14298,9 +14423,9 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) } else { - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); } @@ -14608,7 +14733,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) idAmd->idCodeSize(sz); code = insCodeRM(ins); - code |= (insEncodeReg345(ins, id->idReg1(), EA_PTRSIZE, &code) << 8); + code |= (insEncodeReg345(id, id->idReg1(), EA_PTRSIZE, &code) << 8); dst = emitOutputAM(dst, idAmd, code, nullptr); @@ -14709,7 +14834,7 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) // ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) { - assert(TakesEvexPrefix(id->idIns())); + assert(TakesEvexPrefix(id)); insTupleType tt = insTupleTypeInfo(id->idIns()); assert(hasTupleTypeInfo(id->idIns())); @@ -14920,12 +15045,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #ifdef TARGET_AMD64 // Support only scalar AVX instructions and hence size is hard coded to 4-byte. - code = AddSimdPrefixIfNeeded(ins, code, EA_4BYTE); + code = AddSimdPrefixIfNeeded(id, code, EA_4BYTE); if (((ins == INS_cdq) || (ins == INS_cwde)) && - (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))) + (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id)))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); #endif @@ -15199,8 +15324,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_SHF: code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMRreg(ins, id->idReg1(), size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, id->idReg1(), size, code); // set the W bit if (size != EA_1BYTE) @@ -15209,9 +15334,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } // Emit the REX prefix if it exists - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Output a size prefix for a 16-bit operand @@ -15267,8 +15392,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMRreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, code); mReg = id->idReg1(); rReg = id->idReg2(); } @@ -15277,7 +15402,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeMI(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); assert((code & 0xC000) == 0); code |= 0xC000; @@ -15291,19 +15416,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); mReg = id->idReg2(); rReg = id->idReg1(); } assert(code & 0x00FF0000); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { if (IsDstDstSrcAVXInstruction(ins)) { @@ -15313,17 +15438,17 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // (Though we will need to handle the few ops that can have the 'vvvv' bits as destination, // e.g. pslldq, when/if we support those instructions with 2 registers.) // (see x64 manual Table 2-9. Instructions with a VEX.vvvv destination) - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { // This is a "merge" move instruction. // Encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg2(), size, code); } } - regcode = (insEncodeReg345(ins, rReg, size, &code) | insEncodeReg012(ins, mReg, size, &code)); + regcode = (insEncodeReg345(id, rReg, size, &code) | insEncodeReg012(id, mReg, size, &code)); // Output the REX prefix dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); @@ -15438,8 +15563,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } @@ -15466,8 +15591,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } sz = emitSizeOfInsDsc(id); @@ -15495,8 +15620,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } sz = emitSizeOfInsDsc(id); @@ -15507,8 +15632,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD: case IF_ARW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); sz = emitSizeOfInsDsc(id); break; @@ -15516,7 +15641,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD_RRD: { code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); dst = emitOutputAM(dst, id, code); sz = emitSizeOfInsDsc(id); break; @@ -15606,7 +15731,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15617,10 +15742,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } @@ -15641,15 +15766,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } @@ -15662,8 +15787,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // 4-byte AVX instructions are special cased inside emitOutputSV @@ -15674,7 +15799,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } break; @@ -15688,8 +15813,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // 4-byte AVX instructions are special cased inside emitOutputSV @@ -15700,7 +15825,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } @@ -15712,7 +15837,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SWR_RRD: case IF_SRW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15723,10 +15848,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); break; @@ -15760,7 +15885,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15771,10 +15896,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } @@ -15805,15 +15930,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } @@ -15827,8 +15952,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // Special case 4-byte AVX instructions @@ -15838,7 +15963,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } sz = emitSizeOfInsDsc(id); @@ -15853,8 +15978,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // Special case 4-byte AVX instructions @@ -15864,7 +15989,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } sz = emitSizeOfInsDsc(id); @@ -15873,7 +15998,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_MRD_OFF: code = insCode(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15884,10 +16009,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = insEncodeReg012(id->idIns(), id->idReg1(), size, &code); + regcode = insEncodeReg012(id, id->idReg1(), size, &code); dst = emitOutputCV(dst, id, code | 0x30 | regcode); sz = emitSizeOfInsDsc(id); break; @@ -15896,7 +16021,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_MWR_RRD: case IF_MRW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15907,10 +16032,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); sz = emitSizeOfInsDsc(id); break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 13f79bc3833cca..802849ca805b54 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -44,7 +44,7 @@ UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp); UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val); UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code); UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code, int val); -UNATIVE_OFFSET emitInsSizeRR(instruction ins, regNumber reg1, regNumber reg2, emitAttr attr); +UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, regNumber reg1, regNumber reg2, emitAttr attr); UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code); UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val); UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code); @@ -72,18 +72,18 @@ unsigned emitGetVexPrefixSize(instruction ins, emitAttr attr); unsigned emitGetEvexPrefixSize(instruction ins); unsigned emitGetPrefixSize(code_t code, bool includeRexPrefixSize); unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code); -unsigned emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, code_t code); +unsigned emitGetAdjustedSizeEvexAware(const instrDesc *id, emitAttr attr, code_t code); -unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code); -unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code); -code_t insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code); -unsigned insEncodeRegSIB(instruction ins, regNumber reg, code_t* code); +unsigned insEncodeReg012(const instrDesc *id, regNumber reg, emitAttr size, code_t* code); +unsigned insEncodeReg345(const instrDesc *id, regNumber reg, emitAttr size, code_t* code); +code_t insEncodeReg3456(const instrDesc *id, regNumber reg, emitAttr size, code_t code); +unsigned insEncodeRegSIB(const instrDesc *id, regNumber reg, code_t* code); -code_t insEncodeMRreg(instruction ins, code_t code); -code_t insEncodeRMreg(instruction ins, code_t code); -code_t insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code); -code_t insEncodeRRIb(instruction ins, regNumber reg, emitAttr size); -code_t insEncodeOpreg(instruction ins, regNumber reg, emitAttr size); +code_t insEncodeMRreg(const instrDesc *id, code_t code); +code_t insEncodeRMreg(const instrDesc *id, code_t code); +code_t insEncodeMRreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code); +code_t insEncodeRRIb(const instrDesc *id, regNumber reg, emitAttr size); +code_t insEncodeOpreg(const instrDesc *id, regNumber reg, emitAttr size); unsigned insSSval(unsigned scale); @@ -102,14 +102,17 @@ bool IsVexEncodedInstruction(instruction ins) const; bool IsEvexEncodedInstruction(instruction ins) const; bool IsVexOrEvexEncodedInstruction(instruction ins) const; -code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); +code_t insEncodeMIreg(const instrDesc *id, regNumber reg, emitAttr size, code_t code); -code_t AddRexWPrefix(instruction ins, code_t code); -code_t AddRexRPrefix(instruction ins, code_t code); -code_t AddRexXPrefix(instruction ins, code_t code); -code_t AddRexBPrefix(instruction ins, code_t code); +code_t AddRexWPrefix(const instrDesc *id, code_t code); +code_t AddRexRPrefix(const instrDesc *id, code_t code); +code_t AddRexXPrefix(const instrDesc *id, code_t code); +code_t AddRexBPrefix(const instrDesc *id, code_t code); code_t AddRexPrefix(instruction ins, code_t code); +code_t AddEvexVPrimePrefix(code_t code); +code_t AddEvexRPrimePrefix(code_t code); + bool EncodedBySSE38orSSE3A(instruction ins); bool Is4ByteSSEInstruction(instruction ins); static bool IsMovInstruction(instruction ins); @@ -180,13 +183,15 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr // Returns: // `true` if W bit needs to be set to 1. // -bool IsWEvexOpcodeExtension(instruction ins) +bool IsWEvexOpcodeExtension(const instrDesc *id) { - if (!TakesEvexPrefix(ins)) + if (!TakesEvexPrefix(id)) { return false; } + instruction ins = id->idIns(); + switch (ins) { case INS_movq: @@ -485,7 +490,7 @@ bool UseSimdEncoding() const #define EVEX_PREFIX_MASK 0xFF00000000000000ULL #define EVEX_PREFIX_CODE 0x6200000000000000ULL -bool TakesEvexPrefix(instruction ins) const; +bool TakesEvexPrefix(const instrDesc *id) const; //------------------------------------------------------------------------ // hasEvexPrefix: Returns true if the instruction encoding already @@ -513,9 +518,13 @@ code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); // // Returns: // code with prefix added. -code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) +// TODO-XARCH-AVX512 come back and check whether we can id `id` directly (no need) +// to pass emitAttr size +code_t AddSimdPrefixIfNeeded(const instrDesc *id, code_t code, emitAttr size) { - if (TakesEvexPrefix(ins)) + instruction ins = id->idIns(); + + if (TakesEvexPrefix(id)) { code = AddEvexPrefix(ins, code, size); } @@ -536,11 +545,14 @@ code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) // size - operand size // // Returns: -// `true` if code has an Evex prefix. -// -code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size) +// TRUE if code has an Evex prefix. +// TODO-XARCH-AVX512 come back and check whether we can id `id` directly (no need) +// to pass emitAttr size +code_t AddSimdPrefixIfNeededAndNotPresent(const instrDesc *id, code_t code, emitAttr size) { - if (TakesEvexPrefix(ins)) + instruction ins = id->idIns(); + + if (TakesEvexPrefix(id)) { code = !hasEvexPrefix(code) ? AddEvexPrefix(ins, code, size) : code; } @@ -551,7 +563,7 @@ code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr return code; } -bool TakesSimdPrefix(instruction ins) const; +bool TakesSimdPrefix(const instrDesc *id) const; //------------------------------------------------------------------------ // hasVexOrEvexPrefix: Returns true if the instruction encoding already @@ -1023,4 +1035,8 @@ inline bool HasEmbeddedBroadcast(instrDesc* id) return false; } + +inline bool HasHighSIMDReg(const instrDesc *id) const; +inline bool IsHighSIMDReg(regNumber) const; + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index d1f80d8220cc60..42fecfa02a08b2 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19108,6 +19108,22 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp) #endif } +bool GenTree::isEvexCompatibleHWIntrinsic(Compiler* comp) +{ + assert(gtOper == GT_HWINTRINSIC); + assert(comp != nullptr); + +// TODO-XARCH-AVX512 remove the ReturnsPerElementMask check once K registers have been properly +// implemented in the register allocator +#if defined(TARGET_XARCH) + return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) && !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId()); +#elif defined(TARGET_ARM64) + return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) && !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId()); +#else + return false; +#endif +} + GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID, CorInfoType simdBaseJitType, diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index de6f900cfafc5b..52da45772e7d89 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1517,6 +1517,7 @@ struct GenTree bool isCommutativeHWIntrinsic() const; bool isContainableHWIntrinsic() const; bool isRMWHWIntrinsic(Compiler* comp); + bool isEvexCompatibleHWIntrinsic(Compiler* comp); #else bool isCommutativeHWIntrinsic() const { @@ -1532,6 +1533,11 @@ struct GenTree { return false; } + + bool isEvexCompatibleHWIntrinsic(Compiler* comp) + { + return false; + } #endif // FEATURE_HW_INTRINSICS static bool OperIsCommutative(genTreeOps gtOper) diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index b1299df1c1f1cf..fc3f6e9425e667 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -158,6 +158,9 @@ enum HWIntrinsicFlag : unsigned int // contained HW_Flag_MaybeCommutative = 0x80000, + // The intrinsic has EVEX compatible form + HW_Flag_NoEvexSemantics = 0x100000 + #elif defined(TARGET_ARM64) // The intrinsic has an immediate operand // - the value can be (and should be) encoded in a corresponding instruction when the operand value is constant @@ -172,7 +175,10 @@ enum HWIntrinsicFlag : unsigned int HW_Flag_SIMDScalar = 0x1000, // The intrinsic supports some sort of containment analysis - HW_Flag_SupportsContainment = 0x2000 + HW_Flag_SupportsContainment = 0x2000, + + // The intrinsic does not have an EVEX compatible form + HW_Flag_NoEvexSemantics = 0x4000 #else #error Unsupported platform @@ -761,6 +767,18 @@ struct HWIntrinsicInfo #endif } + static bool HasEvexSemantics(NamedIntrinsic id) + { + HWIntrinsicFlag flags = lookupFlags(id); +#if defined(TARGET_XARCH) + return (flags & HW_Flag_NoEvexSemantics) == 0; +#elif defined(TARGET_ARM64) + return (flags & HW_Flag_NoEvexSemantics) == 0; +#else +#error Unsupported platform +#endif + } + static bool HasSpecialImport(NamedIntrinsic id) { HWIntrinsicFlag flags = lookupFlags(id); diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index b2b08c8d828c85..66201f0f37ac14 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1527,7 +1527,7 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) regNumber op2Reg = op2->GetRegNum(); regNumber addrBaseReg = REG_NA; regNumber addrIndexReg = REG_NA; - regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT); + regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT(compiler)); if (numArgs == 5) { diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index cb2610315eca24..0cf9154301ef50 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -66,7 +66,7 @@ HARDWARE_INTRINSIC(Vector128, EqualsAll, HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_One, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) @@ -168,7 +168,7 @@ HARDWARE_INTRINSIC(Vector256, EqualsAll, HARDWARE_INTRINSIC(Vector256, EqualsAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, get_One, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) @@ -254,42 +254,42 @@ HARDWARE_INTRINSIC(SSE, Add, HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -306,7 +306,7 @@ HARDWARE_INTRINSIC(SSE, Min, HARDWARE_INTRINSIC(SSE, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, MoveHighToLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE, MoveLowToHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(SSE, MoveMask, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(SSE, MoveMask, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, MoveScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, MultiplyScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -315,10 +315,10 @@ HARDWARE_INTRINSIC(SSE, Prefetch0, HARDWARE_INTRINSIC(SSE, Prefetch1, 0, 1, {INS_invalid, INS_prefetcht1, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, Prefetch2, 0, 1, {INS_invalid, INS_prefetcht2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, PrefetchNonTemporal, 0, 1, {INS_invalid, INS_prefetchnta, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, Reciprocal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, Reciprocal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, Shuffle, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, SqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -355,42 +355,42 @@ HARDWARE_INTRINSIC(SSE2, AddScalar, HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, ConvertToUInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) @@ -418,7 +418,7 @@ HARDWARE_INTRINSIC(SSE2, MemoryFence, HARDWARE_INTRINSIC(SSE2, MaxScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2, Min, 16, 2, {INS_invalid, INS_pminub, INS_pminsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) HARDWARE_INTRINSIC(SSE2, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, MoveScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE2, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuludq, INS_invalid, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, MultiplyHigh, 16, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) @@ -471,11 +471,11 @@ HARDWARE_INTRINSIC(SSE2_X64, StoreNonTemporal, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SSE3 Intrinsics -HARDWARE_INTRINSIC(SSE3, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE3, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE3, LoadDquVector128, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE3, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE3, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE3, LoadDquVector128, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE3, MoveAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE3, MoveHighAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) @@ -487,54 +487,54 @@ HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate, // SSSE3 Intrinsics HARDWARE_INTRINSIC(SSSE3, Abs, 16, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(SSSE3, AlignRight, 16, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSSE3, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSSE3, MultiplyHighRoundScale, 16, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSSE3, Shuffle, 16, 2, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSSE3, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSSE3, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SSE41 Intrinsics -HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(SSE41, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE41, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE41, Extract, 16, 2, {INS_pextrb, INS_pextrb, INS_invalid, INS_invalid, INS_pextrd, INS_pextrd, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiIns|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE41, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE41, Insert, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE41, LoadAlignedVector128NonTemporal, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41, Max, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE41, Min, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE41, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE41, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE41, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE41, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE41, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE41, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE41, TestC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(SSE41, TestNotZAndNotC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(SSE41, TestZ, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(SSE41, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, TestC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, TestNotZAndNotC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE41, TestZ, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -566,15 +566,15 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX Intrinsics HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, And, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, Compare, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX, CompareEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX, CompareGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) @@ -588,7 +588,7 @@ HARDWARE_INTRINSIC(AVX, CompareNotLessThan, HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) @@ -597,43 +597,43 @@ HARDWARE_INTRINSIC(AVX, ConvertToVector256Double, HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX, Divide, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX, DotProduct, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX, DotProduct, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, DuplicateEvenIndexed, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX, DuplicateOddIndexed, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX, ExtractVector128, 32, 2, {INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, InsertVector128, 32, 3, {INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX, LoadAlignedVector256, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, LoadDquVector256, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX, LoadDquVector256, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, LoadVector256, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX, Max, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) HARDWARE_INTRINSIC(AVX, Min, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) -HARDWARE_INTRINSIC(AVX, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg) -HARDWARE_INTRINSIC(AVX, MoveMask, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, MoveMask, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, Multiply, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX, Or, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX, Permute, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilps, INS_vpermilpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX, Permute2x128, 32, 3, {INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX, Permute2x128, 32, 3, {INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, PermuteVar, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilpsvar, INS_vpermilpdvar}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX, Reciprocal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, ReciprocalSqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, RoundCurrentDirection, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, RoundToNearestInteger, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, RoundToNegativeInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, RoundToPositiveInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, RoundToZero, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX, Reciprocal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, ReciprocalSqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, RoundCurrentDirection, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, RoundToNearestInteger, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, RoundToNegativeInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, RoundToPositiveInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, RoundToZero, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, Shuffle, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX, Sqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX, Store, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX, StoreAligned, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX, StoreAlignedNonTemporal, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX, TestC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX, TestC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, UnpackHigh, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, UnpackLow, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) @@ -646,47 +646,47 @@ HARDWARE_INTRINSIC(AVX, Xor, HARDWARE_INTRINSIC(AVX2, Abs, 32, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, And, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, Average, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, ExtractVector128, 32, 2, {INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, ConvertToInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX2, ConvertToUInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int16, 32, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int32, 32, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int64, 32, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX2, GatherVector128, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment) -HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2, GatherVector128, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, InsertVector128, 32, 3, {INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, LoadAlignedVector256NonTemporal, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX2, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX2, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg) +HARDWARE_INTRINSIC(AVX2, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, Max, 32, 2, {INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, Min, 32, 2, {INS_pminsb, INS_pminub, INS_pminsw, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, Multiply, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX2, MultipleSumAbsoluteDifferences, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2, MultipleSumAbsoluteDifferences, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, MultiplyAddAdjacent, 32, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, MultiplyHigh, 32, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, MultiplyHighRoundScale, 32, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, MultiplyLow, 32, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, Or, 32, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, Permute4x64, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq, INS_vpermq, INS_invalid, INS_vpermpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, PermuteVar8x32, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermd, INS_vpermd, INS_invalid, INS_invalid, INS_vpermps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX2, PackSignedSaturate, 32, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -702,7 +702,7 @@ HARDWARE_INTRINSIC(AVX2, ShiftRightLogicalVariable, HARDWARE_INTRINSIC(AVX2, Shuffle, 32, 2, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_pshufd, INS_pshufd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_MaybeIMM) HARDWARE_INTRINSIC(AVX2, ShuffleHigh, 32, 2, {INS_invalid, INS_invalid, INS_pshufhw, INS_pshufhw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, ShuffleLow, 32, 2, {INS_invalid, INS_invalid, INS_pshuflw, INS_pshuflw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX2, Sign, 32, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2, Sign, 32, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, Subtract, 32, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, SubtractSaturate, 32, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -721,56 +721,56 @@ HARDWARE_INTRINSIC(AVXVNNI, MultiplyWideningAndAddSaturate, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AES Intrinsics -HARDWARE_INTRINSIC(AES, Decrypt, 16, 2, {INS_invalid, INS_aesdec, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, DecryptLast, 16, 2, {INS_invalid, INS_aesdeclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, Encrypt, 16, 2, {INS_invalid, INS_aesenc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, EncryptLast, 16, 2, {INS_invalid, INS_aesenclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, InverseMixColumns, 16, 1, {INS_invalid, INS_aesimc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, KeygenAssist, 16, 2, {INS_invalid, INS_aeskeygenassist, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AES, Decrypt, 16, 2, {INS_invalid, INS_aesdec, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, DecryptLast, 16, 2, {INS_invalid, INS_aesdeclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, Encrypt, 16, 2, {INS_invalid, INS_aesenc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, EncryptLast, 16, 2, {INS_invalid, INS_aesenclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, InverseMixColumns, 16, 1, {INS_invalid, INS_aesimc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, KeygenAssist, 16, 2, {INS_invalid, INS_aeskeygenassist, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1, GetMaskUpToLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1, ResetLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1, TrailingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) -HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, GetMaskUpToLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1_X64, ResetLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1_X64, TrailingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) -HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(BMI2, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2_X64, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2_X64, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2_X64, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(BMI2_X64, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -807,7 +807,7 @@ HARDWARE_INTRINSIC(LZCNT_X64, LeadingZeroCount, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // PCLMULQDQ Intrinsics -HARDWARE_INTRINSIC(PCLMULQDQ, CarrylessMultiply, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pclmulqdq, INS_pclmulqdq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(PCLMULQDQ, CarrylessMultiply, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pclmulqdq, INS_pclmulqdq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -840,8 +840,8 @@ HARDWARE_INTRINSIC(SSE, COMISS, HARDWARE_INTRINSIC(SSE, UCOMISS, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, COMISD, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, UCOMISD, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 59c6dc1221f4d0..b2a17f0e60ca22 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -202,8 +202,8 @@ INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_64Bit) INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit) -INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_NONE, INS_FLAGS_None) -INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_NONE, INS_FLAGS_None) +INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqu32 +INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqa32 INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_TT_TUPLE1_FIXED, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) @@ -341,10 +341,10 @@ INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_TT_FULL_MEM, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result // TODO-XArch-AVX512: pand, pandn, por, and pxor have AVX512 instructions under different names, pandd, pandq etc -INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs -INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs -INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs -INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs +INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pand32 +INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pand32 +INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pand32 +INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation @@ -493,10 +493,10 @@ INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Broadcast int32 value from reg/memory to entire ymm register INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register -INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed floating point values -INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed integer values -INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values -INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values +INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit ) // Extract 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is extractf32x4 +INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit ) // Extract 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is extractf32x4 +INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is insertf32x4 +INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is inserti32x4 INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute 64-bit of input register diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 1203485793fa69..067683f644d8b2 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -274,6 +274,15 @@ regMaskTP LinearScan::allSIMDRegs() return availableFloatRegs; } +regMaskTP LinearScan::lowSIMDRegs() +{ +#if defined(TARGET_AMD64) + return (availableFloatRegs & RBM_LOWFLOAT); +#else + return availableFloatRegs; +#endif +} + void LinearScan::updateNextFixedRef(RegRecord* regRecord, RefPosition* nextRefPosition) { LsraLocation nextLocation; @@ -386,7 +395,7 @@ regMaskTP LinearScan::internalFloatRegCandidates() } else { - return RBM_FLT_CALLEE_TRASH; + return RBM_FLT_CALLEE_TRASH(compiler); } } @@ -463,7 +472,7 @@ regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask) case LSRA_LIMIT_CALLER: { - mask = getConstrainedRegMask(mask, RBM_CALLEE_TRASH, minRegCount); + mask = getConstrainedRegMask(mask, RBM_CALLEE_TRASH(compiler), minRegCount); } break; @@ -478,6 +487,15 @@ regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask) } break; +#if defined(TARGET_AMD64) + case LSRA_LIMIT_UPPER_SIMD_SET: + if ((mask & LsraLimitUpperSimdSet) != RBM_NONE) + { + mask = getConstrainedRegMask(mask, LsraLimitUpperSimdSet, minRegCount); + } + break; +#endif + default: unreached(); } @@ -675,8 +693,8 @@ LinearScan::LinearScan(Compiler* theCompiler) availableIntRegs &= ~RBM_FPBASE; #endif // ETW_EBP_FRAMED - availableFloatRegs = RBM_ALLFLOAT; - availableDoubleRegs = RBM_ALLDOUBLE; + availableFloatRegs = RBM_ALLFLOAT(compiler); + availableDoubleRegs = RBM_ALLDOUBLE(compiler); #if defined(TARGET_AMD64) || defined(TARGET_ARM64) if (compiler->opts.compDbgEnC) @@ -688,6 +706,7 @@ LinearScan::LinearScan(Compiler* theCompiler) availableDoubleRegs &= ~RBM_CALLEE_SAVED; } #endif // TARGET_AMD64 || TARGET_ARM64 + compiler->rpFrameType = FT_NOT_SET; compiler->rpMustCreateEBPCalled = false; @@ -7453,7 +7472,7 @@ regNumber LinearScan::getTempRegForResolution(BasicBlock* fromBlock, BasicBlock* if (type == TYP_DOUBLE) { // Exclude any doubles for which the odd half isn't in freeRegs. - freeRegs = freeRegs & ((freeRegs << 1) & RBM_ALLDOUBLE); + freeRegs = freeRegs & ((freeRegs << 1) & RBM_ALLDOUBLE(this)); } #endif @@ -8955,14 +8974,16 @@ void dumpRegMask(regMaskTP regs) { printf("[allIntButFP]"); } - else if (regs == RBM_ALLFLOAT) + /* + else if (regs == RBM_ALLFLOAT(0)) { printf("[allFloat]"); } - else if (regs == RBM_ALLDOUBLE) + else if (regs == RBM_ALLDOUBLE(0)) { printf("[allDouble]"); } + */ else { dspRegMask(regs); @@ -11864,7 +11885,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, } else { - callerCalleePrefs = callerSaveRegs(currentInterval->registerType); + callerCalleePrefs = callerSaveRegs(currentInterval->registerType, linearScan->compiler); } // If this has a delayed use (due to being used in a rmw position of a diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index be371b6a626f85..4a9f8fb119301b 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -75,9 +75,9 @@ inline regMaskTP calleeSaveRegs(RegisterType rt) //------------------------------------------------------------------------ // callerSaveRegs: Get the set of caller-save registers of the given RegisterType // -inline regMaskTP callerSaveRegs(RegisterType rt) +inline regMaskTP callerSaveRegs(RegisterType rt, Compiler *compiler) { - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; + return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH(compiler); } //------------------------------------------------------------------------ @@ -737,7 +737,7 @@ class LinearScan : public LinearScanInterface // This controls the registers available for allocation enum LsraStressLimitRegs{LSRA_LIMIT_NONE = 0, LSRA_LIMIT_CALLEE = 0x1, LSRA_LIMIT_CALLER = 0x2, - LSRA_LIMIT_SMALL_SET = 0x3, LSRA_LIMIT_MASK = 0x3}; + LSRA_LIMIT_SMALL_SET = 0x3, LSRA_LIMIT_UPPER_SIMD_SET = 0x4, LSRA_LIMIT_MASK = 0x7}; // When LSRA_LIMIT_SMALL_SET is specified, it is desirable to select a "mixed" set of caller- and callee-save // registers, so as to get different coverage than limiting to callee or caller. @@ -757,6 +757,8 @@ class LinearScan : public LinearScanInterface (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_ESI | RBM_EDI); #endif // !UNIX_AMD64_ABI static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); + static const regMaskTP LsraLimitUpperSimdSet = (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 + | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31); #elif defined(TARGET_ARM) // On ARM, we may need two registers to set up the target register for a virtual call, so we need // to have at least the maximum number of arg registers, plus 2. @@ -1062,6 +1064,7 @@ class LinearScan : public LinearScanInterface regMaskTP allRegs(RegisterType rt); regMaskTP allByteRegs(); regMaskTP allSIMDRegs(); + regMaskTP lowSIMDRegs(); regMaskTP internalFloatRegCandidates(); void makeRegisterInactive(RegRecord* physRegRecord); diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index 2c8a16af1a8646..dcd23a6c4e4917 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -811,7 +811,7 @@ int LinearScan::BuildCast(GenTreeCast* cast) // Floating point to integer casts requires a temporary register. if (varTypeIsFloating(srcType) && !varTypeIsFloating(castType)) { - buildInternalFloatRegisterDefForNode(cast, RBM_ALLFLOAT); + buildInternalFloatRegisterDefForNode(cast, RBM_ALLFLOAT(compiler)); setInternalRegsDelayFree = true; } #endif diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index d5510858a2b9fa..89317145c3906b 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -557,6 +557,21 @@ RefPosition* LinearScan::newRefPosition(Interval* theInterval, { if (theInterval != nullptr) { +#if defined(TARGET_AMD64) + if (mask == RBM_LOWSIMD) + { + // Constrain if we have to for float/simd types + if (varTypeIsFloating(theInterval->registerType) || varTypeIsSIMD(theInterval->registerType)) + { + mask = lowSIMDRegs(); + } + else + { + mask = RBM_NONE; + } + + } +#endif if (mask == RBM_NONE) { mask = allRegs(theInterval->registerType); @@ -784,7 +799,7 @@ regMaskTP LinearScan::getKillSetForStoreInd(GenTreeStoreInd* tree) // the allocated register for the `data` operand. However, all the (x86) optimized // helpers have the same kill set: EDX. And note that currently, only x86 can return // `true` for genUseOptimizedWriteBarriers(). - killMask = RBM_CALLEE_TRASH_NOGC; + killMask = RBM_CALLEE_TRASH_NOGC(compiler); } else { @@ -871,7 +886,7 @@ regMaskTP LinearScan::getKillSetForModDiv(GenTreeOp* node) // regMaskTP LinearScan::getKillSetForCall(GenTreeCall* call) { - regMaskTP killMask = RBM_CALLEE_TRASH; + regMaskTP killMask = RBM_CALLEE_TRASH(compiler); #ifdef TARGET_X86 if (compiler->compFloatingPointUsed) { @@ -894,7 +909,7 @@ regMaskTP LinearScan::getKillSetForCall(GenTreeCall* call) // if there is no FP used, we can ignore the FP kills if (!compiler->compFloatingPointUsed) { - killMask &= ~RBM_FLT_CALLEE_TRASH; + killMask &= ~RBM_FLT_CALLEE_TRASH(compiler); } #ifdef TARGET_ARM if (call->IsVirtualStub()) @@ -1188,7 +1203,7 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo continue; } Interval* interval = getIntervalForLocalVar(varIndex); - const bool isCallKill = ((killMask == RBM_INT_CALLEE_TRASH) || (killMask == RBM_CALLEE_TRASH)); + const bool isCallKill = ((killMask == RBM_INT_CALLEE_TRASH) || (killMask == RBM_CALLEE_TRASH(compiler))); if (isCallKill) { @@ -1498,7 +1513,7 @@ void LinearScan::buildUpperVectorSaveRefPositions(GenTree* tree, LsraLocation cu { // We assume that the kill set includes at least some callee-trash registers, but // that it doesn't include any callee-save registers. - assert((fpCalleeKillSet & RBM_FLT_CALLEE_TRASH) != RBM_NONE); + assert((fpCalleeKillSet & RBM_FLT_CALLEE_TRASH(compiler)) != RBM_NONE); assert((fpCalleeKillSet & RBM_FLT_CALLEE_SAVED) == RBM_NONE); // We only need to save the upper half of any large vector vars that are currently live. @@ -2008,7 +2023,7 @@ void LinearScan::UpdateRegStateForStructArg(LclVarDsc* argDsc) if ((argDsc->GetArgReg() != REG_STK) && (argDsc->GetArgReg() != REG_NA)) { - if (genRegMask(argDsc->GetArgReg()) & (RBM_ALLFLOAT)) + if (genRegMask(argDsc->GetArgReg()) & (RBM_ALLFLOAT(compiler))) { assert(genRegMask(argDsc->GetArgReg()) & (RBM_FLTARG_REGS)); floatRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->GetArgReg()); @@ -2022,7 +2037,7 @@ void LinearScan::UpdateRegStateForStructArg(LclVarDsc* argDsc) if ((argDsc->GetOtherArgReg() != REG_STK) && (argDsc->GetOtherArgReg() != REG_NA)) { - if (genRegMask(argDsc->GetOtherArgReg()) & (RBM_ALLFLOAT)) + if (genRegMask(argDsc->GetOtherArgReg()) & (RBM_ALLFLOAT(compiler))) { assert(genRegMask(argDsc->GetOtherArgReg()) & (RBM_FLTARG_REGS)); floatRegState->rsCalleeRegArgMaskLiveIn |= genRegMask(argDsc->GetOtherArgReg()); @@ -2944,7 +2959,7 @@ void LinearScan::BuildDefsWithKills(GenTree* tree, int dstCount, regMaskTP dstCa // RefPositions in that case. // This must be done after the kills, so that we know which large vectors are still live. // - if ((killMask & RBM_FLT_CALLEE_TRASH) != RBM_NONE) + if ((killMask & RBM_FLT_CALLEE_TRASH(compiler)) != RBM_NONE) { buildUpperVectorSaveRefPositions(tree, currentLoc + 1, killMask); } @@ -3844,7 +3859,7 @@ int LinearScan::BuildReturn(GenTree* tree) break; case TYP_DOUBLE: // We ONLY want the valid double register in the RBM_DOUBLERET mask. - useCandidates = (RBM_DOUBLERET & RBM_ALLDOUBLE); + useCandidates = (RBM_DOUBLERET & RBM_ALLDOUBLE(compiler)); break; case TYP_LONG: useCandidates = RBM_LNGRET; diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 940a6d68858c57..9360cf2d41daa6 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -152,7 +152,12 @@ int LinearScan::BuildNode(GenTree* tree) srcCount = 0; assert(dstCount == 1); assert(!tree->IsReuseRegVal()); - RefPosition* def = BuildDef(tree); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + RefPosition* def = BuildDef(tree, opRegMask); def->getInterval()->isConstant = true; } break; @@ -1836,21 +1841,30 @@ int LinearScan::BuildIntrinsic(GenTree* tree) break; } assert(tree->gtGetOp2IfPresent() == nullptr); + + // TODO-XARCH-AVX512 this is overly constraining register available as NI_System_Math_Abs + // can be lowered to EVEX compatible instruction (the rest cannot) +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + int srcCount; if (op1->isContained()) { - srcCount = BuildOperandUses(op1); + srcCount = BuildOperandUses(op1, opRegMask); } else { - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, opRegMask); srcCount = 1; } if (internalFloatDef != nullptr) { buildInternalRegisterUses(); } - BuildDef(tree); + BuildDef(tree, opRegMask); return srcCount; } @@ -2048,6 +2062,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it // is not allocated the same register as the target. bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic(compiler); // Create internal temps, and handle any other special requirements. // Note that the default case for building uses will handle the RMW flag, but if the uses @@ -2129,9 +2144,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(numArgs == 3); assert(!isRMW); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif // MaskMove hardcodes the destination (op3) in DI/EDI/RDI - srcCount += BuildOperandUses(op1); - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op1, opRegMask); + srcCount += BuildOperandUses(op2, opRegMask); srcCount += BuildOperandUses(op3, RBM_EDI); buildUses = false; @@ -2146,11 +2166,17 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert(isRMW); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, opRegMask); srcCount += 1; - srcCount += op2->isContained() ? BuildOperandUses(op2) : BuildDelayFreeUses(op2, op1); + srcCount += op2->isContained() ? BuildOperandUses(op2, opRegMask) : BuildDelayFreeUses(op2, op1, opRegMask); srcCount += BuildDelayFreeUses(op3, op1, RBM_XMM0); buildUses = false; @@ -2344,15 +2370,22 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(numArgs == 3); assert(!isRMW); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + + // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); + srcCount += BuildOperandUses(op1, opRegMask); + srcCount += BuildDelayFreeUses(op2, nullptr, opRegMask); // op3 should always be contained assert(op3->isContained()); // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); setInternalRegsDelayFree = true; buildUses = false; @@ -2367,17 +2400,24 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* op4 = intrinsicTree->Op(4); GenTree* op5 = intrinsicTree->Op(5); +#if defined(TARGET_AMD64) + regMaskTP opRegMask = RBM_LOWSIMD; +#else + regMaskTP opRegMask = RBM_NONE; +#endif + + // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); - srcCount += BuildDelayFreeUses(op3); - srcCount += BuildDelayFreeUses(op4); + srcCount += BuildOperandUses(op1, opRegMask); + srcCount += BuildDelayFreeUses(op2, nullptr, opRegMask); + srcCount += BuildDelayFreeUses(op3, nullptr, opRegMask); + srcCount += BuildDelayFreeUses(op4, nullptr, opRegMask); // op5 should always be contained assert(op5->isContained()); // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); setInternalRegsDelayFree = true; buildUses = false; @@ -2395,25 +2435,40 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert((numArgs > 0) && (numArgs < 4)); + regMaskTP op1RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op1RegCandidates = RBM_LOWSIMD; + } +#endif + if (intrinsicTree->OperIsMemoryLoadOrStore()) { - srcCount += BuildAddrUses(op1); + srcCount += BuildAddrUses(op1, op1RegCandidates); } else if (isRMW && !op1->isContained()) { - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, op1RegCandidates); srcCount += 1; } else { - srcCount += BuildOperandUses(op1); + srcCount += BuildOperandUses(op1, op1RegCandidates); } if (op2 != nullptr) { + regMaskTP op2RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op2RegCandidates = RBM_LOWSIMD; + } +#endif if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained()) { - srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1)); + srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1), op2RegCandidates); } else if (isRMW) { @@ -2422,7 +2477,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // When op2 is not contained and we are commutative, we can set op2 // to also be a tgtPrefUse. Codegen will then swap the operands. - tgtPrefUse2 = BuildUse(op2); + tgtPrefUse2 = BuildUse(op2, op2RegCandidates); srcCount += 1; } else if (!op2->isContained() || varTypeIsArithmetic(intrinsicTree->TypeGet())) @@ -2430,7 +2485,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // When op2 is not contained or if we are producing a scalar value // we need to mark it as delay free because the operand and target // exist in the same register set. - srcCount += BuildDelayFreeUses(op2, op1); + srcCount += BuildDelayFreeUses(op2, op1, op2RegCandidates); } else { @@ -2438,17 +2493,24 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // have no concerns of overwriting op2 because they exist in different // register sets. - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op2, op2RegCandidates); } } else { - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op2, op2RegCandidates); } if (op3 != nullptr) { - srcCount += isRMW ? BuildDelayFreeUses(op3, op1) : BuildOperandUses(op3); + regMaskTP op3RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op3RegCandidates = RBM_LOWSIMD; + } +#endif + srcCount += isRMW ? BuildDelayFreeUses(op3, op1, op3RegCandidates) : BuildOperandUses(op3, op3RegCandidates); } } } @@ -2458,6 +2520,13 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (dstCount == 1) { +#if defined(TARGET_AMD64) + if (!intrinsicTree->isEvexCompatibleHWIntrinsic(compiler)) + { + dstCandidates = RBM_LOWSIMD; + } +#endif + BuildDef(intrinsicTree, dstCandidates); } else diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 0e9e43aeeeabf5..0ae32a6a459f0c 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -7289,7 +7289,14 @@ bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) availRegCount = CNT_CALLEE_SAVED_FLOAT; if (!loopContainsCall) { - availRegCount += CNT_CALLEE_TRASH_FLOAT - 1; + availRegCount += (CNT_CALLEE_TRASH_FLOAT - 1); + // TODO-XARCH-AVX512 fix this +#if defined(TARGET_AMD64) + if (!DoJitStressEvexEncoding()) + { + availRegCount -= 16; + } +#endif } #ifdef TARGET_ARM // For ARM each double takes two FP registers diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index 6f63bc51211d63..ca90673e85adfe 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -94,7 +94,27 @@ REGDEF(XMM12, 12+XMMBASE, XMMMASK(12), "mm12" ) REGDEF(XMM13, 13+XMMBASE, XMMMASK(13), "mm13" ) REGDEF(XMM14, 14+XMMBASE, XMMMASK(14), "mm14" ) REGDEF(XMM15, 15+XMMBASE, XMMMASK(15), "mm15" ) -REGDEF(STK, 16+XMMBASE, 0x0000, "STK" ) + +REGDEF(XMM16, 16+XMMBASE, XMMMASK(16), "mm16" ) +REGDEF(XMM17, 17+XMMBASE, XMMMASK(17), "mm17" ) +REGDEF(XMM18, 18+XMMBASE, XMMMASK(18), "mm18" ) +REGDEF(XMM19, 19+XMMBASE, XMMMASK(19), "mm19" ) +REGDEF(XMM20, 20+XMMBASE, XMMMASK(20), "mm20" ) +REGDEF(XMM21, 21+XMMBASE, XMMMASK(21), "mm21" ) +REGDEF(XMM22, 22+XMMBASE, XMMMASK(22), "mm22" ) +REGDEF(XMM23, 23+XMMBASE, XMMMASK(23), "mm23" ) + +REGDEF(XMM24, 24+XMMBASE, XMMMASK(24), "mm24" ) +REGDEF(XMM25, 25+XMMBASE, XMMMASK(25), "mm25" ) +REGDEF(XMM26, 26+XMMBASE, XMMMASK(26), "mm26" ) +REGDEF(XMM27, 27+XMMBASE, XMMMASK(27), "mm27" ) +REGDEF(XMM28, 28+XMMBASE, XMMMASK(28), "mm28" ) +REGDEF(XMM29, 29+XMMBASE, XMMMASK(29), "mm29" ) +REGDEF(XMM30, 30+XMMBASE, XMMMASK(30), "mm30" ) +REGDEF(XMM31, 31+XMMBASE, XMMMASK(31), "mm31" ) + +REGDEF(STK, 32+XMMBASE, 0x0000, "STK" ) + #endif // !TARGET_X86 #elif defined(TARGET_ARM) diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 392a5417141398..b9b2f82a85a781 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -61,7 +61,11 @@ inline bool compUnixX86Abi() /*****************************************************************************/ // The following are intended to capture only those #defines that cannot be replaced // with static const members of Target -#if defined(TARGET_XARCH) +#if defined(TARGET_AMD64) +#define REGMASK_BITS 64 +#define CSE_CONST_SHARED_LOW_BITS 16 + +#elif defined(TARGET_XARCH) #define REGMASK_BITS 32 #define CSE_CONST_SHARED_LOW_BITS 16 @@ -135,7 +139,7 @@ enum _regMask_enum : unsigned __int64 #elif defined(TARGET_AMD64) -enum _regNumber_enum : unsigned +enum _regNumber_enum : unsigned { #define REGDEF(name, rnum, mask, sname) REG_##name = rnum, #define REGALIAS(alias, realname) REG_##alias = REG_##realname, @@ -146,13 +150,16 @@ enum _regNumber_enum : unsigned ACTUAL_REG_COUNT = REG_COUNT - 1 // everything but REG_STK (only real regs) }; -enum _regMask_enum : unsigned +enum _regMask_enum : unsigned __int64 { RBM_NONE = 0, + RBM_LOWSIMD = 1LL << 63, #define REGDEF(name, rnum, mask, sname) RBM_##name = mask, #define REGALIAS(alias, realname) RBM_##alias = RBM_##realname, #include "register.h" + + }; #elif defined(TARGET_X86) @@ -192,7 +199,7 @@ enum _regMask_enum : unsigned // In any case, we believe that is OK to freely cast between these types; no information will // be lost. -#if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_AMD64) || defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) typedef unsigned __int64 regMaskTP; #else typedef unsigned regMaskTP; @@ -528,7 +535,7 @@ inline regMaskTP genRegMask(regNumber reg) // (L1 latency on sandy bridge is 4 cycles for [base] and 5 for [base + index*c] ) // the reason this is AMD-only is because the x86 BE will try to get reg masks for REG_STK // and the result needs to be zero. - regMaskTP result = 1 << reg; + regMaskTP result = 1ULL << reg; assert(result == regMasks[reg]); return result; #else diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 4ec128a6345d21..507db71db3d615 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -78,10 +78,13 @@ #endif // !UNIX_AMD64_ABI #define CSE_CONSTS 1 // Enable if we want to CSE constants - #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15) - #define RBM_ALLDOUBLE RBM_ALLFLOAT + #define RBM_LOWFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 ) + #define RBM_HIGHFLOAT (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) + #define RBM_ALLFLOAT(c) ((c->DoJitStressEvexEncoding()) ? RBM_LOWFLOAT | RBM_HIGHFLOAT : RBM_LOWFLOAT) + + #define RBM_ALLDOUBLE(c) RBM_ALLFLOAT(c) #define REG_FP_FIRST REG_XMM0 - #define REG_FP_LAST REG_XMM15 + #define REG_FP_LAST REG_XMM31 #define FIRST_FP_ARGREG REG_XMM0 #ifdef UNIX_AMD64_ABI @@ -117,7 +120,7 @@ #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_RDI|RBM_RSI|RBM_EDX|RBM_ECX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) #define RBM_FLT_CALLEE_SAVED (0) - #define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ + #define RBM_FLT_CALLEE_TRASH(c) (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) #define REG_PROFILER_ENTER_ARG_0 REG_R14 #define RBM_PROFILER_ENTER_ARG_0 RBM_R14 @@ -132,7 +135,8 @@ #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) #define RBM_FLT_CALLEE_SAVED (RBM_XMM6|RBM_XMM7|RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) - #define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) + #define RBM_FLT_CALLEE_TRASH_OLD (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM16|RBM_XMM17|RBM_XMM18|RBM_XMM19|RBM_XMM20|RBM_XMM21|RBM_XMM22|RBM_XMM23|RBM_XMM24|RBM_XMM25|RBM_XMM26|RBM_XMM27|RBM_XMM28|RBM_XMM29|RBM_XMM30|RBM_XMM31) + #define RBM_FLT_CALLEE_TRASH(c) (c->DoJitStressEvexEncoding() ? RBM_FLT_CALLEE_TRASH_OLD : (RBM_FLT_CALLEE_TRASH_OLD & ~RBM_HIGHFLOAT)) #endif // !UNIX_AMD64_ABI #define RBM_OSR_INT_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_EBP) @@ -140,7 +144,7 @@ #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 - #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_TRASH(c) (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH(c)) #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) @@ -169,20 +173,20 @@ #define REG_WRITE_BARRIER_SRC REG_ARG_1 #define RBM_WRITE_BARRIER_SRC RBM_ARG_1 - #define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH + #define RBM_CALLEE_TRASH_NOGC(c) RBM_CALLEE_TRASH(c) // Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_TRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC + #define RBM_CALLEE_TRASH_WRITEBARRIER(c) RBM_CALLEE_TRASH_NOGC(c) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC + #define RBM_CALLEE_GCTRASH_WRITEBARRIER(c) RBM_CALLEE_TRASH_NOGC(c) // Registers killed by CORINFO_HELP_ASSIGN_BYREF. - #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH_NOGC) + #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF(c) (RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH_NOGC(c)) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. // Note that RDI and RSI are still valid byref pointers after this helper call, despite their value being changed. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF (RBM_CALLEE_TRASH_NOGC & ~(RBM_RDI | RBM_RSI)) + #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF(c) (RBM_CALLEE_TRASH_NOGC(c) & ~(RBM_RDI | RBM_RSI)) #if 0 #define REG_VAR_ORDER REG_EAX,REG_EDX,REG_ECX,REG_ESI,REG_EDI,REG_EBX,REG_ETW_FRAMED_EBP_LIST \ @@ -203,7 +207,8 @@ #endif // !UNIX_AMD64_ABI #endif - #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15 + #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15,REG_XMM16,REG_XMM17,REG_XMM18,REG_XMM19,REG_XMM20,REG_XMM21,REG_XMM22,REG_XMM23,REG_XMM24,REG_XMM25,REG_XMM26,REG_XMM27,REG_XMM28,REG_XMM29,REG_XMM30,REG_XMM31 + #ifdef UNIX_AMD64_ABI #define CNT_CALLEE_SAVED (5 + REG_ETW_FRAMED_EBP_COUNT) @@ -221,7 +226,7 @@ #define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED) #define CNT_CALLEE_SAVED_FLOAT (10) - #define CNT_CALLEE_TRASH_FLOAT (6) + #define CNT_CALLEE_TRASH_FLOAT (22) #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ESI,REG_EDI,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ESI,RBM_EDI,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 @@ -413,8 +418,8 @@ // The registers trashed by profiler enter/leave/tailcall hook // See vm\amd64\asmhelpers.asm for more details. - #define RBM_PROFILER_ENTER_TRASH RBM_CALLEE_TRASH - #define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH + #define RBM_PROFILER_ENTER_TRASH(c) RBM_CALLEE_TRASH(c) + #define RBM_PROFILER_TAILCALL_TRASH(c) RBM_PROFILER_LEAVE_TRASH(c) // The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper. #ifdef UNIX_AMD64_ABI @@ -423,16 +428,16 @@ // On Unix a struct of size >=9 and <=16 bytes in size is returned in two return registers. // The return registers could be any two from the set { RAX, RDX, XMM0, XMM1 }. // STOP_FOR_GC helper preserves all the 4 possible return registers. - #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) - #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) + #define RBM_STOP_FOR_GC_TRASH(c) (RBM_CALLEE_TRASH(c) & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) + #define RBM_PROFILER_LEAVE_TRASH(c) (RBM_CALLEE_TRASH(c) & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) #else // See vm\amd64\asmhelpers.asm for more details. - #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) - #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) + #define RBM_STOP_FOR_GC_TRASH(c) (RBM_CALLEE_TRASH(c) & ~(RBM_FLOATRET | RBM_INTRET)) + #define RBM_PROFILER_LEAVE_TRASH(c) (RBM_CALLEE_TRASH(c) & ~(RBM_FLOATRET | RBM_INTRET)) #endif // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. - #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH + #define RBM_INIT_PINVOKE_FRAME_TRASH(c) RBM_CALLEE_TRASH(c) #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R10 | RBM_RCX)) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_RCX diff --git a/src/coreclr/jit/targetarm.h b/src/coreclr/jit/targetarm.h index 60b01d6063dcbd..1b9135238b9342 100644 --- a/src/coreclr/jit/targetarm.h +++ b/src/coreclr/jit/targetarm.h @@ -62,17 +62,17 @@ #define RBM_INT_CALLEE_SAVED (RBM_R4|RBM_R5|RBM_R6|RBM_R7|RBM_R8|RBM_R9|RBM_R10) #define RBM_INT_CALLEE_TRASH (RBM_R0|RBM_R1|RBM_R2|RBM_R3|RBM_R12|RBM_LR) #define RBM_FLT_CALLEE_SAVED (RBM_F16|RBM_F17|RBM_F18|RBM_F19|RBM_F20|RBM_F21|RBM_F22|RBM_F23|RBM_F24|RBM_F25|RBM_F26|RBM_F27|RBM_F28|RBM_F29|RBM_F30|RBM_F31) - #define RBM_FLT_CALLEE_TRASH (RBM_F0|RBM_F1|RBM_F2|RBM_F3|RBM_F4|RBM_F5|RBM_F6|RBM_F7|RBM_F8|RBM_F9|RBM_F10|RBM_F11|RBM_F12|RBM_F13|RBM_F14|RBM_F15) + #define RBM_FLT_CALLEE_TRASH(c) (RBM_F0|RBM_F1|RBM_F2|RBM_F3|RBM_F4|RBM_F5|RBM_F6|RBM_F7|RBM_F8|RBM_F9|RBM_F10|RBM_F11|RBM_F12|RBM_F13|RBM_F14|RBM_F15) #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) - #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_TRASH(c) (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH(c)) #define REG_DEFAULT_HELPER_CALL_TARGET REG_R12 #define RBM_DEFAULT_HELPER_CALL_TARGET RBM_R12 #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) - #define RBM_ALLFLOAT (RBM_FLT_CALLEE_SAVED | RBM_FLT_CALLEE_TRASH) - #define RBM_ALLDOUBLE (RBM_F0|RBM_F2|RBM_F4|RBM_F6|RBM_F8|RBM_F10|RBM_F12|RBM_F14|RBM_F16|RBM_F18|RBM_F20|RBM_F22|RBM_F24|RBM_F26|RBM_F28|RBM_F30) + #define RBM_ALLFLOAT(c) (RBM_FLT_CALLEE_SAVED | RBM_FLT_CALLEE_TRASH(__noop)) + #define RBM_ALLDOUBLE(c) (RBM_F0|RBM_F2|RBM_F4|RBM_F6|RBM_F8|RBM_F10|RBM_F12|RBM_F14|RBM_F16|RBM_F18|RBM_F20|RBM_F22|RBM_F24|RBM_F26|RBM_F28|RBM_F30) #define REG_VAR_ORDER REG_R3,REG_R2,REG_R1,REG_R0,REG_R4,REG_LR,REG_R12,\ REG_R5,REG_R6,REG_R7,REG_R8,REG_R9,REG_R10 @@ -163,20 +163,20 @@ #define REG_WRITE_BARRIER_SRC_BYREF REG_ARG_1 #define RBM_WRITE_BARRIER_SRC_BYREF RBM_ARG_1 - #define RBM_CALLEE_TRASH_NOGC (RBM_R2|RBM_R3|RBM_LR|RBM_DEFAULT_HELPER_CALL_TARGET) + #define RBM_CALLEE_TRASH_NOGC(c) (RBM_R2|RBM_R3|RBM_LR|RBM_DEFAULT_HELPER_CALL_TARGET) // Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_TRASH_WRITEBARRIER (RBM_R0|RBM_R3|RBM_LR|RBM_DEFAULT_HELPER_CALL_TARGET) + #define RBM_CALLEE_TRASH_WRITEBARRIER(c) (RBM_R0|RBM_R3|RBM_LR|RBM_DEFAULT_HELPER_CALL_TARGET) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER RBM_CALLEE_TRASH_WRITEBARRIER + #define RBM_CALLEE_GCTRASH_WRITEBARRIER(c) RBM_CALLEE_TRASH_WRITEBARRIER(c) // Registers killed by CORINFO_HELP_ASSIGN_BYREF. - #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF | RBM_CALLEE_TRASH_NOGC) + #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF(c) (RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF | RBM_CALLEE_TRASH_NOGC(c)) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. // Note that r0 and r1 are still valid byref pointers after this helper call, despite their value being changed. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF RBM_CALLEE_TRASH_NOGC + #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF(c) RBM_CALLEE_TRASH_NOGC(c) // GenericPInvokeCalliHelper VASigCookie Parameter #define REG_PINVOKE_COOKIE_PARAM REG_R4 @@ -221,11 +221,11 @@ // The registers trashed by profiler enter/leave/tailcall hook // See vm\arm\asmhelpers.asm for more details. - #define RBM_PROFILER_ENTER_TRASH RBM_NONE + #define RBM_PROFILER_ENTER_TRASH(c) RBM_NONE // While REG_PROFILER_RET_SCRATCH is not trashed by the method, the register allocator must // consider it killed by the return. - #define RBM_PROFILER_LEAVE_TRASH RBM_PROFILER_RET_SCRATCH - #define RBM_PROFILER_TAILCALL_TRASH RBM_NONE + #define RBM_PROFILER_LEAVE_TRASH(c) RBM_PROFILER_RET_SCRATCH + #define RBM_PROFILER_TAILCALL_TRASH(c) RBM_NONE // Which register are int and long values returned in ? #define REG_INTRET REG_R0 @@ -242,10 +242,10 @@ // The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper (JIT_RareDisableHelper). // See vm\arm\amshelpers.asm for more details. - #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_LNGRET|RBM_R7|RBM_R8|RBM_R11|RBM_DOUBLERET|RBM_F2|RBM_F3|RBM_F4|RBM_F5|RBM_F6|RBM_F7)) + #define RBM_STOP_FOR_GC_TRASH(c) (RBM_CALLEE_TRASH(c) & ~(RBM_LNGRET|RBM_R7|RBM_R8|RBM_R11|RBM_DOUBLERET|RBM_F2|RBM_F3|RBM_F4|RBM_F5|RBM_F6|RBM_F7)) // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. - #define RBM_INIT_PINVOKE_FRAME_TRASH (RBM_CALLEE_TRASH | RBM_PINVOKE_TCB | RBM_PINVOKE_SCRATCH) + #define RBM_INIT_PINVOKE_FRAME_TRASH(c) (RBM_CALLEE_TRASH(c) | RBM_PINVOKE_TCB | RBM_PINVOKE_SCRATCH) #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_R0 @@ -284,7 +284,7 @@ #define RBM_ARG_REGS (RBM_ARG_0|RBM_ARG_1|RBM_ARG_2|RBM_ARG_3) #define RBM_FLTARG_REGS (RBM_F0|RBM_F1|RBM_F2|RBM_F3|RBM_F4|RBM_F5|RBM_F6|RBM_F7|RBM_F8|RBM_F9|RBM_F10|RBM_F11|RBM_F12|RBM_F13|RBM_F14|RBM_F15) - #define RBM_DBL_REGS RBM_ALLDOUBLE + #define RBM_DBL_REGS RBM_ALLDOUBLE(0) extern const regNumber fltArgRegs [MAX_FLOAT_REG_ARG]; extern const regMaskTP fltArgMasks[MAX_FLOAT_REG_ARG]; diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index b30c7af7e40d98..cccbb6bd4f110e 100644 --- a/src/coreclr/jit/targetarm64.h +++ b/src/coreclr/jit/targetarm64.h @@ -70,17 +70,17 @@ #define RBM_INT_CALLEE_SAVED (RBM_R19|RBM_R20|RBM_R21|RBM_R22|RBM_R23|RBM_R24|RBM_R25|RBM_R26|RBM_R27|RBM_R28) #define RBM_INT_CALLEE_TRASH (RBM_R0|RBM_R1|RBM_R2|RBM_R3|RBM_R4|RBM_R5|RBM_R6|RBM_R7|RBM_R8|RBM_R9|RBM_R10|RBM_R11|RBM_R12|RBM_R13|RBM_R14|RBM_R15|RBM_IP0|RBM_IP1|RBM_LR) #define RBM_FLT_CALLEE_SAVED (RBM_V8|RBM_V9|RBM_V10|RBM_V11|RBM_V12|RBM_V13|RBM_V14|RBM_V15) - #define RBM_FLT_CALLEE_TRASH (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V16|RBM_V17|RBM_V18|RBM_V19|RBM_V20|RBM_V21|RBM_V22|RBM_V23|RBM_V24|RBM_V25|RBM_V26|RBM_V27|RBM_V28|RBM_V29|RBM_V30|RBM_V31) + #define RBM_FLT_CALLEE_TRASH(c) (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V16|RBM_V17|RBM_V18|RBM_V19|RBM_V20|RBM_V21|RBM_V22|RBM_V23|RBM_V24|RBM_V25|RBM_V26|RBM_V27|RBM_V28|RBM_V29|RBM_V30|RBM_V31) #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) - #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_TRASH(c) (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH(c)) #define REG_DEFAULT_HELPER_CALL_TARGET REG_R12 #define RBM_DEFAULT_HELPER_CALL_TARGET RBM_R12 #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) - #define RBM_ALLFLOAT (RBM_FLT_CALLEE_SAVED | RBM_FLT_CALLEE_TRASH) - #define RBM_ALLDOUBLE RBM_ALLFLOAT + #define RBM_ALLFLOAT(c) (RBM_FLT_CALLEE_SAVED | RBM_FLT_CALLEE_TRASH(__noop)) + #define RBM_ALLDOUBLE(c) RBM_ALLFLOAT(c) // REG_VAR_ORDER is: (CALLEE_TRASH & ~CALLEE_TRASH_NOGC), CALLEE_TRASH_NOGC, CALLEE_SAVED #define REG_VAR_ORDER REG_R0, REG_R1, REG_R2, REG_R3, REG_R4, REG_R5, \ @@ -175,20 +175,20 @@ #define REG_WRITE_BARRIER_SRC_BYREF REG_R13 #define RBM_WRITE_BARRIER_SRC_BYREF RBM_R13 - #define RBM_CALLEE_TRASH_NOGC (RBM_R12|RBM_R15|RBM_IP0|RBM_IP1|RBM_DEFAULT_HELPER_CALL_TARGET) + #define RBM_CALLEE_TRASH_NOGC(c) (RBM_R12|RBM_R15|RBM_IP0|RBM_IP1|RBM_DEFAULT_HELPER_CALL_TARGET) // Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_TRASH_WRITEBARRIER (RBM_R14|RBM_CALLEE_TRASH_NOGC) + #define RBM_CALLEE_TRASH_WRITEBARRIER(c) (RBM_R14|RBM_CALLEE_TRASH_NOGC(c)) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC + #define RBM_CALLEE_GCTRASH_WRITEBARRIER(c) RBM_CALLEE_TRASH_NOGC(c) // Registers killed by CORINFO_HELP_ASSIGN_BYREF. - #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF | RBM_CALLEE_TRASH_NOGC) + #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF(c) (RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF | RBM_CALLEE_TRASH_NOGC(c)) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. // Note that x13 and x14 are still valid byref pointers after this helper call, despite their value being changed. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF RBM_CALLEE_TRASH_NOGC + #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF(c) RBM_CALLEE_TRASH_NOGC(c) // GenericPInvokeCalliHelper VASigCookie Parameter #define REG_PINVOKE_COOKIE_PARAM REG_R15 @@ -236,9 +236,9 @@ #define RBM_PROFILER_LEAVE_ARG_CALLER_SP RBM_R11 // The registers trashed by profiler enter/leave/tailcall hook - #define RBM_PROFILER_ENTER_TRASH (RBM_CALLEE_TRASH & ~(RBM_ARG_REGS|RBM_ARG_RET_BUFF|RBM_FLTARG_REGS|RBM_FP)) - #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_ARG_REGS|RBM_ARG_RET_BUFF|RBM_FLTARG_REGS|RBM_FP)) - #define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH + #define RBM_PROFILER_ENTER_TRASH(c) (RBM_CALLEE_TRASH(c) & ~(RBM_ARG_REGS|RBM_ARG_RET_BUFF|RBM_FLTARG_REGS|RBM_FP)) + #define RBM_PROFILER_LEAVE_TRASH(c) (RBM_CALLEE_TRASH(c) & ~(RBM_ARG_REGS|RBM_ARG_RET_BUFF|RBM_FLTARG_REGS|RBM_FP)) + #define RBM_PROFILER_TAILCALL_TRASH(c) RBM_PROFILER_LEAVE_TRASH(c) // Which register are int and long values returned in ? #define REG_INTRET REG_R0 @@ -253,10 +253,10 @@ #define RBM_DOUBLERET RBM_V0 // The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper - #define RBM_STOP_FOR_GC_TRASH RBM_CALLEE_TRASH + #define RBM_STOP_FOR_GC_TRASH(c) RBM_CALLEE_TRASH(c) // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. - #define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH + #define RBM_INIT_PINVOKE_FRAME_TRASH(c) RBM_CALLEE_TRASH(c) #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~(RBM_R0 | RBM_R1 | RBM_R2 | RBM_R3 | RBM_R4 | RBM_R5 | RBM_R6 | RBM_R7 | RBM_R8 | RBM_R15)) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_R15 diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h index 09c6b6b0b04ef9..2dd2f2ada40667 100644 --- a/src/coreclr/jit/targetx86.h +++ b/src/coreclr/jit/targetx86.h @@ -88,13 +88,13 @@ #define RBM_FLTARG_REGS (RBM_FLTARG_0|RBM_FLTARG_1|RBM_FLTARG_2|RBM_FLTARG_3) - #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7) - #define RBM_ALLDOUBLE RBM_ALLFLOAT + #define RBM_ALLFLOAT(c) (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7) + #define RBM_ALLDOUBLE(c) RBM_ALLFLOAT(c) // TODO-CQ: Currently we are following the x86 ABI for SSE2 registers. // This should be reconsidered. #define RBM_FLT_CALLEE_SAVED RBM_NONE - #define RBM_FLT_CALLEE_TRASH RBM_ALLFLOAT + #define RBM_FLT_CALLEE_TRASH(c) RBM_ALLFLOAT(c) #define REG_VAR_ORDER_FLT REG_XMM0, REG_XMM1, REG_XMM2, REG_XMM3, REG_XMM4, REG_XMM5, REG_XMM6, REG_XMM7 #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 @@ -121,7 +121,7 @@ #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX) #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) - #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_TRASH(c) (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH(c)) #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) @@ -200,7 +200,7 @@ #define RBM_OPTIMIZED_WRITE_BARRIER_SRC (RBM_EAX|RBM_ECX|RBM_EBX|RBM_ESI|RBM_EDI) #endif // NOGC_WRITE_BARRIERS - #define RBM_CALLEE_TRASH_NOGC RBM_EDX + #define RBM_CALLEE_TRASH_NOGC(c) RBM_EDX // Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. // Note that x86 normally emits an optimized (source-register-specific) write barrier, but can emit @@ -208,20 +208,20 @@ CLANG_FORMAT_COMMENT_ANCHOR; #ifdef FEATURE_USE_ASM_GC_WRITE_BARRIERS - #define RBM_CALLEE_TRASH_WRITEBARRIER (RBM_EAX | RBM_EDX) + #define RBM_CALLEE_TRASH_WRITEBARRIER(c) (RBM_EAX | RBM_EDX) #else // !FEATURE_USE_ASM_GC_WRITE_BARRIERS - #define RBM_CALLEE_TRASH_WRITEBARRIER RBM_CALLEE_TRASH + #define RBM_CALLEE_TRASH_WRITEBARRIER(c) RBM_CALLEE_TRASH(c) #endif // !FEATURE_USE_ASM_GC_WRITE_BARRIERS // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER RBM_EDX + #define RBM_CALLEE_GCTRASH_WRITEBARRIER(c) RBM_EDX // Registers killed by CORINFO_HELP_ASSIGN_BYREF. - #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_ESI | RBM_EDI | RBM_ECX) + #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF(c) (RBM_ESI | RBM_EDI | RBM_ECX) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. // Note that RDI and RSI are still valid byref pointers after this helper call, despite their value being changed. - #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF RBM_ECX + #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF(c) RBM_ECX // GenericPInvokeCalliHelper unmanaged target parameter #define REG_PINVOKE_TARGET_PARAM REG_EAX @@ -269,11 +269,11 @@ #define RBM_DOUBLERET RBM_NONE // The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper - #define RBM_STOP_FOR_GC_TRASH RBM_CALLEE_TRASH + #define RBM_STOP_FOR_GC_TRASH(c) RBM_CALLEE_TRASH(c) // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. On x86, this helper has a custom calling // convention that takes EDI as argument (but doesn't trash it), trashes EAX, and returns ESI. - #define RBM_INIT_PINVOKE_FRAME_TRASH (RBM_PINVOKE_SCRATCH | RBM_PINVOKE_TCB) + #define RBM_INIT_PINVOKE_FRAME_TRASH(c) (RBM_PINVOKE_SCRATCH | RBM_PINVOKE_TCB) #define RBM_VALIDATE_INDIRECT_CALL_TRASH (RBM_INT_CALLEE_TRASH & ~RBM_ECX) #define REG_VALIDATE_INDIRECT_CALL_ADDR REG_ECX @@ -307,9 +307,9 @@ // The registers trashed by profiler enter/leave/tailcall hook // See vm\i386\asmhelpers.asm for more details. - #define RBM_PROFILER_ENTER_TRASH RBM_NONE - #define RBM_PROFILER_LEAVE_TRASH RBM_NONE - #define RBM_PROFILER_TAILCALL_TRASH (RBM_CALLEE_TRASH & ~RBM_ARG_REGS) + #define RBM_PROFILER_ENTER_TRASH(c) RBM_NONE + #define RBM_PROFILER_LEAVE_TRASH(c) RBM_NONE + #define RBM_PROFILER_TAILCALL_TRASH(c) (RBM_CALLEE_TRASH(c) & ~RBM_ARG_REGS) // What sort of reloc do we use for [disp32] address mode #define IMAGE_REL_BASED_DISP32 IMAGE_REL_BASED_HIGHLOW diff --git a/src/coreclr/jit/unwindarm.cpp b/src/coreclr/jit/unwindarm.cpp index 8929158713acbb..0cb48991d26a26 100644 --- a/src/coreclr/jit/unwindarm.cpp +++ b/src/coreclr/jit/unwindarm.cpp @@ -193,7 +193,7 @@ void Compiler::unwindEndEpilog() void Compiler::unwindPushPopMaskInt(regMaskTP maskInt, bool useOpsize16) { // floating point registers cannot be specified in 'maskInt' - assert((maskInt & RBM_ALLFLOAT) == 0); + assert((maskInt & RBM_ALLFLOAT(this)) == 0); UnwindInfo* pu = &funCurrentFunc()->uwi; @@ -280,7 +280,7 @@ void Compiler::unwindPushPopMaskInt(regMaskTP maskInt, bool useOpsize16) void Compiler::unwindPushPopMaskFloat(regMaskTP maskFloat) { // Only floating pointer registers can be specified in 'maskFloat' - assert((maskFloat & ~RBM_ALLFLOAT) == 0); + assert((maskFloat & ~RBM_ALLFLOAT(this)) == 0); // If the maskFloat is zero there is no unwind code to emit // @@ -339,7 +339,7 @@ void Compiler::unwindPushMaskInt(regMaskTP maskInt) void Compiler::unwindPushMaskFloat(regMaskTP maskFloat) { // Only floating point registers should be in maskFloat - assert((maskFloat & RBM_ALLFLOAT) == maskFloat); + assert((maskFloat & RBM_ALLFLOAT(this)) == maskFloat); #if defined(FEATURE_CFI_SUPPORT) if (generateCFIUnwindCodes()) @@ -390,7 +390,7 @@ void Compiler::unwindPopMaskFloat(regMaskTP maskFloat) #endif // FEATURE_CFI_SUPPORT // Only floating point registers should be in maskFloat - assert((maskFloat & RBM_ALLFLOAT) == maskFloat); + assert((maskFloat & RBM_ALLFLOAT(this)) == maskFloat); unwindPushPopMaskFloat(maskFloat); }