Skip to content

Commit 7b1fd25

Browse files
authored
Add xarch blsmsk (#66561)
1 parent 5c559f1 commit 7b1fd25

File tree

4 files changed

+134
-35
lines changed

4 files changed

+134
-35
lines changed

src/coreclr/jit/instrsxarch.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -594,8 +594,8 @@ INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BA
594594
// BMI1
595595
INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
596596
INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND NOT
597-
INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Extract Lowest Set Isolated Bit
598-
INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Get Mask Up to Lowest Set Bit
597+
INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Extract Lowest Set Isolated Bit
598+
INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Get Mask Up to Lowest Set Bit (CF is set when the source operand is zero, so CF is written rather than reset)
599599
INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_IsDstDstSrcAVXInstruction) // Reset Lowest Set Bit
600600
INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_Flags_IsDstDstSrcAVXInstruction) // Bit Field Extract
601601

src/coreclr/jit/lower.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,7 @@ class Lowering final : public Phase
351351
GenTree* TryLowerAndOpToResetLowestSetBit(GenTreeOp* andNode);
352352
GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode);
353353
GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode);
354+
GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode);
354355
void LowerBswapOp(GenTreeOp* node);
355356
#elif defined(TARGET_ARM64)
356357
bool IsValidConstForMovImm(GenTreeHWIntrinsic* node);

src/coreclr/jit/lowerxarch.cpp

Lines changed: 111 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -174,24 +174,35 @@ GenTree* Lowering::LowerMul(GenTreeOp* mul)
174174
GenTree* Lowering::LowerBinaryArithmetic(GenTreeOp* binOp)
175175
{
176176
#ifdef FEATURE_HW_INTRINSICS
177-
if (comp->opts.OptimizationEnabled() && binOp->OperIs(GT_AND) && varTypeIsIntegral(binOp))
177+
if (comp->opts.OptimizationEnabled() && varTypeIsIntegral(binOp))
178178
{
179-
GenTree* replacementNode = TryLowerAndOpToAndNot(binOp);
180-
if (replacementNode != nullptr)
179+
if (binOp->OperIs(GT_AND))
181180
{
182-
return replacementNode->gtNext;
183-
}
181+
GenTree* replacementNode = TryLowerAndOpToAndNot(binOp);
182+
if (replacementNode != nullptr)
183+
{
184+
return replacementNode->gtNext;
185+
}
184186

185-
replacementNode = TryLowerAndOpToResetLowestSetBit(binOp);
186-
if (replacementNode != nullptr)
187-
{
188-
return replacementNode->gtNext;
189-
}
187+
replacementNode = TryLowerAndOpToResetLowestSetBit(binOp);
188+
if (replacementNode != nullptr)
189+
{
190+
return replacementNode->gtNext;
191+
}
190192

191-
replacementNode = TryLowerAndOpToExtractLowestSetBit(binOp);
192-
if (replacementNode != nullptr)
193+
replacementNode = TryLowerAndOpToExtractLowestSetBit(binOp);
194+
if (replacementNode != nullptr)
195+
{
196+
return replacementNode->gtNext;
197+
}
198+
}
199+
else if (binOp->OperIs(GT_XOR))
193200
{
194-
return replacementNode->gtNext;
201+
GenTree* replacementNode = TryLowerXorOpToGetMaskUpToLowestSetBit(binOp);
202+
if (replacementNode != nullptr)
203+
{
204+
return replacementNode->gtNext;
205+
}
195206
}
196207
}
197208
#endif
@@ -4056,6 +4067,93 @@ GenTree* Lowering::TryLowerAndOpToAndNot(GenTreeOp* andNode)
40564067
return andnNode;
40574068
}
40584069

4070+
//----------------------------------------------------------------------------------------------
4071+
// Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit: Lowers a tree XOR(X, ADD(X, -1)) to
4072+
// HWIntrinsic::GetMaskUpToLowestSetBit
4073+
//
4074+
// Arguments:
4075+
// xorNode - GT_XOR node of integral type
4076+
//
4077+
// Return Value:
4078+
// Returns the replacement node if one is created else nullptr indicating no replacement
4079+
//
4080+
// Notes:
4081+
// Performs containment checks on the replacement node if one is created
4082+
GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode)
4083+
{
4084+
assert(xorNode->OperIs(GT_XOR) && varTypeIsIntegral(xorNode));
4085+
4086+
GenTree* op1 = xorNode->gtGetOp1();
4087+
if (!op1->OperIs(GT_LCL_VAR) || comp->lvaGetDesc(op1->AsLclVar())->IsAddressExposed())
4088+
{
4089+
return nullptr;
4090+
}
4091+
4092+
GenTree* op2 = xorNode->gtGetOp2();
4093+
if (!op2->OperIs(GT_ADD))
4094+
{
4095+
return nullptr;
4096+
}
4097+
4098+
GenTree* addOp2 = op2->gtGetOp2();
4099+
if (!addOp2->IsIntegralConst(-1))
4100+
{
4101+
return nullptr;
4102+
}
4103+
4104+
GenTree* addOp1 = op2->gtGetOp1();
4105+
if (!addOp1->OperIs(GT_LCL_VAR) || (addOp1->AsLclVar()->GetLclNum() != op1->AsLclVar()->GetLclNum()))
4106+
{
4107+
return nullptr;
4108+
}
4109+
4110+
// Subsequent nodes may rely on CPU flags set by these nodes in which case we cannot remove them
4111+
if (((addOp2->gtFlags & GTF_SET_FLAGS) != 0) || ((op2->gtFlags & GTF_SET_FLAGS) != 0) ||
4112+
((xorNode->gtFlags & GTF_SET_FLAGS) != 0))
4113+
{
4114+
return nullptr;
4115+
}
4116+
4117+
NamedIntrinsic intrinsic;
4118+
if (xorNode->TypeIs(TYP_LONG) && comp->compOpportunisticallyDependsOn(InstructionSet_BMI1_X64))
4119+
{
4120+
intrinsic = NamedIntrinsic::NI_BMI1_X64_GetMaskUpToLowestSetBit;
4121+
}
4122+
else if (comp->compOpportunisticallyDependsOn(InstructionSet_BMI1))
4123+
{
4124+
intrinsic = NamedIntrinsic::NI_BMI1_GetMaskUpToLowestSetBit;
4125+
}
4126+
else
4127+
{
4128+
return nullptr;
4129+
}
4130+
4131+
LIR::Use use;
4132+
if (!BlockRange().TryGetUse(xorNode, &use))
4133+
{
4134+
return nullptr;
4135+
}
4136+
4137+
GenTreeHWIntrinsic* blsmskNode = comp->gtNewScalarHWIntrinsicNode(xorNode->TypeGet(), op1, intrinsic);
4138+
4139+
JITDUMP("Lower: optimize XOR(X, ADD(X, -1)))\n");
4140+
DISPNODE(xorNode);
4141+
JITDUMP("to:\n");
4142+
DISPNODE(blsmskNode);
4143+
4144+
use.ReplaceWith(blsmskNode);
4145+
4146+
BlockRange().InsertBefore(xorNode, blsmskNode);
4147+
BlockRange().Remove(xorNode);
4148+
BlockRange().Remove(op2);
4149+
BlockRange().Remove(addOp1);
4150+
BlockRange().Remove(addOp2);
4151+
4152+
ContainCheckHWIntrinsic(blsmskNode);
4153+
4154+
return blsmskNode;
4155+
}
4156+
40594157
//----------------------------------------------------------------------------------------------
40604158
// Lowering::LowerBswapOp: Tries to contain GT_BSWAP node when possible
40614159
//

src/tests/JIT/Intrinsics/BMI1Intrinsics.cs

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@ static int Main(string[] args)
1212
// bmi1 expressions are folded to hwintrinsics that return identical results
1313

1414
var values = new (uint input1, uint input2, uint andnExpected, uint blsiExpected, uint blsrExpected, uint blmskExpected)[] {
15-
(0, 0, 0, 0 ,0 ,0),
16-
(1, 0, 1, 1 ,0 ,0xfffffffe),
17-
(uint.MaxValue / 2, 0, 0x7fffffff, 0x1 ,0x7ffffffe ,0xfffffffe),
18-
((uint.MaxValue / 2) - 1, 0, 0x7FFFFFFE, 2 ,0x7FFFFFFC ,0xFFFFFFFC),
19-
((uint.MaxValue / 2) + 1, 0, 0x80000000, 0x80000000 ,0 ,0),
20-
(uint.MaxValue - 1, 0, 0xFFFFFFFE, 2 ,0xFFFFFFFC ,0xFFFFFFFC),
21-
(uint.MaxValue , 0, 0xFFFFFFFF, 1 ,0xFFFFFFFE ,0xFFFFFFFE),
22-
(0xAAAAAAAA,0xAAAAAAAA,0,2,0xAAAAAAA8,0xFFFFFFFC),
23-
(0xAAAAAAAA,0x55555555,0xAAAAAAAA,2,0xAAAAAAA8,0xFFFFFFFC),
15+
(0, 0, 0, 0 ,0 ,0xFFFFFFFF),
16+
(1, 0, 1, 1 ,0 ,1),
17+
(uint.MaxValue / 2, 0, 0x7fffffff, 0x1 ,0x7ffffffe ,1),
18+
((uint.MaxValue / 2) - 1, 0, 0x7FFFFFFE, 2 ,0x7FFFFFFC ,3),
19+
((uint.MaxValue / 2) + 1, 0, 0x80000000, 0x80000000 ,0 ,0xFFFFFFFF),
20+
(uint.MaxValue - 1, 0, 0xFFFFFFFE, 2 ,0xFFFFFFFC ,3),
21+
(uint.MaxValue , 0, 0xFFFFFFFF, 1 ,0xFFFFFFFE ,1),
22+
(0xAAAAAAAA,0xAAAAAAAA,0,2,0xAAAAAAA8,3),
23+
(0xAAAAAAAA,0x55555555,0xAAAAAAAA,2,0xAAAAAAA8,3),
2424
};
2525

2626
foreach (var value in values)
@@ -33,15 +33,15 @@ static int Main(string[] args)
3333

3434

3535
var values2 = new (ulong input1, ulong input2, ulong andnExpected, ulong blsiExpected, ulong blsrExpected, ulong blmskExpected)[] {
36-
(0, 0, 0, 0, 0, 0),
37-
(1, 0, 1, 1, 0,0xFFFFFFFF_FFFFFFFE),
38-
(ulong.MaxValue / 2, 0,0x7FFFFFFF_FFFFFFFF, 1,0x7FFFFFFF_FFFFFFFE,0xFFFFFFFF_FFFFFFFE),
39-
((ulong.MaxValue / 2) - 1, 0,0x7FFFFFFF_FFFFFFFE, 2,0x7FFFFFFF_FFFFFFFC,0xFFFFFFFF_FFFFFFFC),
40-
((ulong.MaxValue / 2) + 1, 0,0x80000000_00000000,0x80000000_00000000, 0, 0),
41-
(ulong.MaxValue - 1, 0,0xFFFFFFFF_FFFFFFFE, 2,0xFFFFFFFF_FFFFFFFC,0xFFFFFFFF_FFFFFFFC),
42-
(ulong.MaxValue, 0,0xFFFFFFFF_FFFFFFFF, 1,0xFFFFFFFF_FFFFFFFE,0xFFFFFFFF_FFFFFFFE),
43-
(0xAAAAAAAA_AAAAAAAA,0xAAAAAAAA_AAAAAAAA, 0, 2,0xAAAAAAAA_AAAAAAA8,0xFFFFFFFF_FFFFFFFC),
44-
(0xAAAAAAAA_AAAAAAAA,0x55555555_55555555,0xAAAAAAAA_AAAAAAAA, 2,0xAAAAAAAA_AAAAAAA8,0xFFFFFFFF_FFFFFFFC),
36+
(0, 0, 0, 0, 0,0xFFFFFFFF_FFFFFFFF),
37+
(1, 0, 1, 1, 0, 1),
38+
(ulong.MaxValue / 2, 0,0x7FFFFFFF_FFFFFFFF, 1,0x7FFFFFFF_FFFFFFFE, 1),
39+
((ulong.MaxValue / 2) - 1, 0,0x7FFFFFFF_FFFFFFFE, 2,0x7FFFFFFF_FFFFFFFC, 3),
40+
((ulong.MaxValue / 2) + 1, 0,0x80000000_00000000,0x80000000_00000000, 0,0xFFFFFFFF_FFFFFFFF),
41+
(ulong.MaxValue - 1, 0,0xFFFFFFFF_FFFFFFFE, 2,0xFFFFFFFF_FFFFFFFC, 3),
42+
(ulong.MaxValue, 0,0xFFFFFFFF_FFFFFFFF, 1,0xFFFFFFFF_FFFFFFFE, 1),
43+
(0xAAAAAAAA_AAAAAAAA,0xAAAAAAAA_AAAAAAAA, 0, 2,0xAAAAAAAA_AAAAAAA8, 3),
44+
(0xAAAAAAAA_AAAAAAAA,0x55555555_55555555,0xAAAAAAAA_AAAAAAAA, 2,0xAAAAAAAA_AAAAAAA8, 3),
4545
};
4646

4747
foreach (var value in values2)
@@ -74,10 +74,10 @@ static int Main(string[] args)
7474
private static ulong ResetLowestSetBit_64bit(ulong x) => x & (x - 1); // bmi1 blsr
7575

7676
[MethodImpl(MethodImplOptions.NoInlining)]
77-
private static uint GetMaskUpToLowestSetBit_32bit(uint x) => (uint)(x ^ (-x)); // bmi1 blmsk
77+
private static uint GetMaskUpToLowestSetBit_32bit(uint x) => x ^ (x - 1); // bmi1 blsmsk
7878

7979
[MethodImpl(MethodImplOptions.NoInlining)]
80-
private static ulong GetMaskUpToLowestSetBit_64bit(ulong x) => x ^ (ulong)(-(long)x); // bmi1 blmsk
80+
private static ulong GetMaskUpToLowestSetBit_64bit(ulong x) => x ^ (x - 1); // bmi1 blsmsk
8181

8282
[MethodImpl(MethodImplOptions.NoInlining)]
8383
private static void Test(uint input, uint output, uint expected, string callerName)

0 commit comments

Comments
 (0)