Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/coreclr/inc/clrconfigvalues.h
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableBMI2, W("EnableBMI2"), 1
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableFMA, W("EnableFMA"), 1, "Allows FMA+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableLZCNT, W("EnableLZCNT"), 1, "Allows LZCNT+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnablePCLMULQDQ, W("EnablePCLMULQDQ"), 1, "Allows PCLMULQDQ+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableMOVBE, W("EnableMOVBE"), 1, "Allows MOVBE+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnablePOPCNT, W("EnablePOPCNT"), 1, "Allows POPCNT+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableSSE, W("EnableSSE"), 1, "Allows SSE+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableSSE2, W("EnableSSE2"), 1, "Allows SSE2+ hardware intrinsics to be disabled")
Expand Down
90 changes: 56 additions & 34 deletions src/coreclr/inc/corinfoinstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,25 @@ enum CORINFO_InstructionSet
InstructionSet_Vector128=17,
InstructionSet_Vector256=18,
InstructionSet_AVXVNNI=19,
InstructionSet_X86Base_X64=20,
InstructionSet_SSE_X64=21,
InstructionSet_SSE2_X64=22,
InstructionSet_SSE3_X64=23,
InstructionSet_SSSE3_X64=24,
InstructionSet_SSE41_X64=25,
InstructionSet_SSE42_X64=26,
InstructionSet_AVX_X64=27,
InstructionSet_AVX2_X64=28,
InstructionSet_AES_X64=29,
InstructionSet_BMI1_X64=30,
InstructionSet_BMI2_X64=31,
InstructionSet_FMA_X64=32,
InstructionSet_LZCNT_X64=33,
InstructionSet_PCLMULQDQ_X64=34,
InstructionSet_POPCNT_X64=35,
InstructionSet_AVXVNNI_X64=36,
InstructionSet_MOVBE=20,
InstructionSet_X86Base_X64=21,
InstructionSet_SSE_X64=22,
InstructionSet_SSE2_X64=23,
InstructionSet_SSE3_X64=24,
InstructionSet_SSSE3_X64=25,
InstructionSet_SSE41_X64=26,
InstructionSet_SSE42_X64=27,
InstructionSet_AVX_X64=28,
InstructionSet_AVX2_X64=29,
InstructionSet_AES_X64=30,
InstructionSet_BMI1_X64=31,
InstructionSet_BMI2_X64=32,
InstructionSet_FMA_X64=33,
InstructionSet_LZCNT_X64=34,
InstructionSet_PCLMULQDQ_X64=35,
InstructionSet_POPCNT_X64=36,
InstructionSet_AVXVNNI_X64=37,
InstructionSet_MOVBE_X64=38,
#endif // TARGET_AMD64
#ifdef TARGET_X86
InstructionSet_X86Base=1,
Expand All @@ -96,23 +98,25 @@ enum CORINFO_InstructionSet
InstructionSet_Vector128=17,
InstructionSet_Vector256=18,
InstructionSet_AVXVNNI=19,
InstructionSet_X86Base_X64=20,
InstructionSet_SSE_X64=21,
InstructionSet_SSE2_X64=22,
InstructionSet_SSE3_X64=23,
InstructionSet_SSSE3_X64=24,
InstructionSet_SSE41_X64=25,
InstructionSet_SSE42_X64=26,
InstructionSet_AVX_X64=27,
InstructionSet_AVX2_X64=28,
InstructionSet_AES_X64=29,
InstructionSet_BMI1_X64=30,
InstructionSet_BMI2_X64=31,
InstructionSet_FMA_X64=32,
InstructionSet_LZCNT_X64=33,
InstructionSet_PCLMULQDQ_X64=34,
InstructionSet_POPCNT_X64=35,
InstructionSet_AVXVNNI_X64=36,
InstructionSet_MOVBE=20,
InstructionSet_X86Base_X64=21,
InstructionSet_SSE_X64=22,
InstructionSet_SSE2_X64=23,
InstructionSet_SSE3_X64=24,
InstructionSet_SSSE3_X64=25,
InstructionSet_SSE41_X64=26,
InstructionSet_SSE42_X64=27,
InstructionSet_AVX_X64=28,
InstructionSet_AVX2_X64=29,
InstructionSet_AES_X64=30,
InstructionSet_BMI1_X64=31,
InstructionSet_BMI2_X64=32,
InstructionSet_FMA_X64=33,
InstructionSet_LZCNT_X64=34,
InstructionSet_PCLMULQDQ_X64=35,
InstructionSet_POPCNT_X64=36,
InstructionSet_AVXVNNI_X64=37,
InstructionSet_MOVBE_X64=38,
#endif // TARGET_X86

};
Expand Down Expand Up @@ -212,6 +216,8 @@ struct CORINFO_InstructionSetFlags
AddInstructionSet(InstructionSet_POPCNT_X64);
if (HasInstructionSet(InstructionSet_AVXVNNI))
AddInstructionSet(InstructionSet_AVXVNNI_X64);
if (HasInstructionSet(InstructionSet_MOVBE))
AddInstructionSet(InstructionSet_MOVBE_X64);
#endif // TARGET_AMD64
#ifdef TARGET_X86
#endif // TARGET_X86
Expand Down Expand Up @@ -357,6 +363,10 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI_X64) && !resultflags.HasInstructionSet(InstructionSet_AVXVNNI))
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI_X64);
if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_MOVBE_X64))
resultflags.RemoveInstructionSet(InstructionSet_MOVBE);
if (resultflags.HasInstructionSet(InstructionSet_MOVBE_X64) && !resultflags.HasInstructionSet(InstructionSet_MOVBE))
resultflags.RemoveInstructionSet(InstructionSet_MOVBE_X64);
if (resultflags.HasInstructionSet(InstructionSet_SSE) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_SSE);
if (resultflags.HasInstructionSet(InstructionSet_SSE2) && !resultflags.HasInstructionSet(InstructionSet_SSE))
Expand Down Expand Up @@ -393,6 +403,8 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_Vector256);
if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2))
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
resultflags.RemoveInstructionSet(InstructionSet_MOVBE);
#endif // TARGET_AMD64
#ifdef TARGET_X86
if (resultflags.HasInstructionSet(InstructionSet_SSE) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
Expand Down Expand Up @@ -431,6 +443,8 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_Vector256);
if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2))
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
resultflags.RemoveInstructionSet(InstructionSet_MOVBE);
#endif // TARGET_X86

} while (!oldflags.Equals(resultflags));
Expand Down Expand Up @@ -563,6 +577,10 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "AVXVNNI";
case InstructionSet_AVXVNNI_X64 :
return "AVXVNNI_X64";
case InstructionSet_MOVBE :
return "MOVBE";
case InstructionSet_MOVBE_X64 :
return "MOVBE_X64";
#endif // TARGET_AMD64
#ifdef TARGET_X86
case InstructionSet_X86Base :
Expand Down Expand Up @@ -603,6 +621,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "Vector256";
case InstructionSet_AVXVNNI :
return "AVXVNNI";
case InstructionSet_MOVBE :
return "MOVBE";
#endif // TARGET_X86

default:
Expand Down Expand Up @@ -652,6 +672,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_Pclmulqdq: return InstructionSet_PCLMULQDQ;
case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_POPCNT;
case READYTORUN_INSTRUCTION_AvxVnni: return InstructionSet_AVXVNNI;
case READYTORUN_INSTRUCTION_Movbe: return InstructionSet_MOVBE;
#endif // TARGET_AMD64
#ifdef TARGET_X86
case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base;
Expand All @@ -671,6 +692,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_Pclmulqdq: return InstructionSet_PCLMULQDQ;
case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_POPCNT;
case READYTORUN_INSTRUCTION_AvxVnni: return InstructionSet_AVXVNNI;
case READYTORUN_INSTRUCTION_Movbe: return InstructionSet_MOVBE;
#endif // TARGET_X86

default:
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/inc/jiteeversionguid.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
#define GUID_DEFINED
#endif // !GUID_DEFINED

constexpr GUID JITEEVersionIdentifier = { /* b0719856-6fe6-407c-bf40-7a57e22b2382 */
0xb0719856,
0x6fe6,
0x407c,
{0xbf, 0x40, 0x7a, 0x57, 0xe2, 0x2b, 0x23, 0x82}
constexpr GUID JITEEVersionIdentifier = { /* dfda4767-9618-4d6a-8105-a86034dc52eb */
0xdfda4767,
0x9618,
0x4d6a,
{0x81, 0x05, 0xa8, 0x60, 0x34, 0xdc, 0x52, 0xeb}
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/inc/readytoruninstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ enum ReadyToRunInstructionSet
READYTORUN_INSTRUCTION_Rdm=24,
READYTORUN_INSTRUCTION_AvxVnni=25,
READYTORUN_INSTRUCTION_Rcpc=26,
READYTORUN_INSTRUCTION_Movbe=27,

};

Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegenlinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1653,7 +1653,7 @@ void CodeGen::genConsumeRegs(GenTree* tree)
}
#endif // FEATURE_HW_INTRINSICS
#endif // TARGET_XARCH
else if (tree->OperIs(GT_BITCAST, GT_NEG, GT_CAST, GT_LSH))
else if (tree->OperIs(GT_BITCAST, GT_NEG, GT_CAST, GT_LSH, GT_BSWAP, GT_BSWAP16))
{
genConsumeRegs(tree->gtGetOp1());
}
Expand Down
40 changes: 23 additions & 17 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -553,35 +553,38 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
//
void CodeGen::genCodeForBswap(GenTree* tree)
{
// TODO: If we're swapping immediately after a read from memory or immediately before
// a write to memory, use the MOVBE instruction instead of the BSWAP instruction if
// the platform supports it.

assert(tree->OperIs(GT_BSWAP, GT_BSWAP16));

regNumber targetReg = tree->GetRegNum();
var_types targetType = tree->TypeGet();

GenTree* operand = tree->gtGetOp1();
assert(operand->isUsedFromReg());
regNumber operandReg = genConsumeReg(operand);

inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true);
genConsumeRegs(operand);

if (tree->OperIs(GT_BSWAP))
if (operand->isUsedFromReg())
{
// 32-bit and 64-bit byte swaps use "bswap reg"
inst_RV(INS_bswap, targetReg, targetType);
inst_Mov(targetType, targetReg, operand->GetRegNum(), /* canSkip */ true);

if (tree->OperIs(GT_BSWAP))
{
// 32-bit and 64-bit byte swaps use "bswap reg"
inst_RV(INS_bswap, targetReg, targetType);
}
else
{
// 16-bit byte swaps use "ror reg.16, 8"
inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE);
}
}
else
{
// 16-bit byte swaps use "ror reg.16, 8"
inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE);
GetEmitter()->emitInsBinary(INS_movbe, emitTypeSize(operand), tree, operand);
}

if (!genCanOmitNormalizationForBswap16(tree))
{
GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false);
}
if (tree->OperIs(GT_BSWAP16) && !genCanOmitNormalizationForBswap16(tree))
{
GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false);
}

genProduceReg(tree);
Expand Down Expand Up @@ -5142,7 +5145,10 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
}
else
{
GetEmitter()->emitInsStoreInd(ins_Store(data->TypeGet()), emitTypeSize(tree), tree);
GetEmitter()->emitInsStoreInd(data->OperIs(GT_BSWAP, GT_BSWAP16) && data->isContained()
? INS_movbe
: ins_Store(data->TypeGet()),
emitTypeSize(tree), tree);
}
}
}
Expand Down
42 changes: 42 additions & 0 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3339,6 +3339,13 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m
GenTree* addr = mem->Addr();
GenTree* data = mem->Data();

if (data->OperIs(GT_BSWAP, GT_BSWAP16) && data->isContained())
{
assert(ins == INS_movbe);

data = data->gtGetOp1();
}

if (addr->OperGet() == GT_CLS_VAR_ADDR)
{
if (data->isContainedIntOrIImmed())
Expand Down Expand Up @@ -10422,6 +10429,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// Is this a 'big' opcode?
else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
{
assert(ins == INS_movbe);

dst += emitOutputByte(dst, 0x66);
}

// Output the REX prefix
dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);

Expand Down Expand Up @@ -11206,6 +11220,13 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// Is this a 'big' opcode?
else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
{
assert(ins == INS_movbe);

dst += emitOutputByte(dst, 0x66);
}

// Output the REX prefix
dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);

Expand Down Expand Up @@ -11676,6 +11697,13 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// Is this a 'big' opcode?
else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
{
assert(ins == INS_movbe);

dst += emitOutputByte(dst, 0x66);
}

// Output the REX prefix
dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);

Expand Down Expand Up @@ -16282,6 +16310,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;
}

case INS_movbe:
if (memAccessKind == PERFSCORE_MEMORY_READ)
{
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C;
}
else
{
assert(memAccessKind == PERFSCORE_MEMORY_WRITE);
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C;
}
break;

default:
// unhandled instruction insFmt combination
perfScoreUnhandledInstruction(id, &result);
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/instrsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868,
#define SSEDBL(c) PACK3(0xf2, 0x0f, c)
#define PCKDBL(c) PACK3(0x66, 0x0f, c)
#define PCKFLT(c) PACK2(0x0f,c)
#define PCKMVB(c) PACK3(0x0F, 0x38, c)

// These macros encode extra byte that is implicit in the macro.
#define PACK4(byte1,byte2,byte3,byte4) (((byte1) << 16) | ((byte2) << 24) | (byte3) | ((byte4) << 8))
Expand Down Expand Up @@ -618,6 +619,9 @@ INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE,
// LZCNT
INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF )

// MOVBE
INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_FLAGS_None )

// POPCNT
INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF )

Expand Down
7 changes: 7 additions & 0 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,13 @@ GenTree* Lowering::LowerNode(GenTree* node)
}
break;

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
case GT_BSWAP:
case GT_BSWAP16:
LowerBswapOp(node->AsOp());
break;
#endif // FEATURE_HW_INTRINSICS && TARGET_XARCH

default:
break;
}
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ class Lowering final : public Phase
GenTree* TryLowerAndOpToResetLowestSetBit(GenTreeOp* andNode);
GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode);
GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode);
void LowerBswapOp(GenTreeOp* node);
#elif defined(TARGET_ARM64)
bool IsValidConstForMovImm(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node);
Expand Down
Loading