Skip to content

Commit 26a6f55

Browse files
authored
Update how OSR and PGO interact (#61453)
When both OSR and PGO are enabled: * Enable instrumenting OSR methods, so that the combined profile data from Tier0 plus any OSR variants provide a full picture for subsequent Tier1 optimization. * Use block profiles for both Tier0 methods that are likely to have patchpoints and OSR methods. * Fix phase ordering so partially jitted methods don't lose probes. * A few more fixes for partial compilation, because the number of things we think we might instrument and the number of things we end up instrumenting can differ. * Also improve the DumpJittedMethod output for OSR, and allow selective dumping of a particular OSR variant by specifying its IL offset. The updates on the runtime side are to pass BBINSTR to OSR methods, and to handle the (typical) case where the OSR method instrumentation schema is a subset of the Tier0 method schema. We are still allowing OSR methods to read the profile data. So they are both profile instrumented and profile optimized. Not clear if this is going to work well as the Tier0 data will be incomplete and optimization quality may be poor. Something to revisit down the road.
1 parent a4bb83a commit 26a6f55

File tree

11 files changed

+192
-48
lines changed

11 files changed

+192
-48
lines changed

eng/pipelines/common/templates/runtimes/run-test-job.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,10 @@ jobs:
554554
scenarios:
555555
- jitosr
556556
- jitosr_stress
557+
- jitosr_pgo
557558
- jitpartialcompilation
559+
- jitpartialcompilation_osr
560+
- jitpartialcompilation_osr_pgo
558561
- jitobjectstackallocation
559562
${{ if in(parameters.testGroup, 'ilasm') }}:
560563
scenarios:

src/coreclr/inc/pgo_formatprocessing.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,35 @@ bool ReadInstrumentationSchemaWithLayout(const uint8_t *pByte, size_t cbDataMax,
361361
});
362362
}
363363

364+
365+
// Return true if schemaTable entries are a subset of the schema described by pByte, with matching entries in the same order.
366+
// Also updates offset of the matching entries in schemaTable to those of the pByte schema.
367+
//
368+
inline bool CheckIfPgoSchemaIsCompatibleAndSetOffsets(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas)
369+
{
370+
size_t nMatched = 0;
371+
size_t initialOffset = cbDataMax;
372+
373+
auto handler = [schemaTable, cSchemas, &nMatched](const ICorJitInfo::PgoInstrumentationSchema& schema)
374+
{
375+
if ((nMatched < cSchemas)
376+
&& (schema.InstrumentationKind == schemaTable[nMatched].InstrumentationKind)
377+
&& (schema.ILOffset == schemaTable[nMatched].ILOffset)
378+
&& (schema.Count == schemaTable[nMatched].Count)
379+
&& (schema.Other == schemaTable[nMatched].Other))
380+
{
381+
schemaTable[nMatched].Offset = schema.Offset;
382+
nMatched++;
383+
}
384+
385+
return true;
386+
};
387+
388+
ReadInstrumentationSchemaWithLayout(pByte, cbDataMax, initialOffset, handler);
389+
390+
return (nMatched == cSchemas);
391+
}
392+
364393
inline bool ReadInstrumentationSchemaWithLayoutIntoSArray(const uint8_t *pByte, size_t cbDataMax, size_t initialOffset, SArray<ICorJitInfo::PgoInstrumentationSchema>* pSchemas)
365394
{
366395
auto lambda = [pSchemas](const ICorJitInfo::PgoInstrumentationSchema &schema)

src/coreclr/jit/compiler.cpp

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2777,6 +2777,18 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
27772777
verboseDump = (JitConfig.JitDumpTier0() > 0);
27782778
}
27792779

2780+
// Optionally suppress dumping some OSR jit requests.
2781+
//
2782+
if (verboseDump && jitFlags->IsSet(JitFlags::JIT_FLAG_OSR))
2783+
{
2784+
const int desiredOffset = JitConfig.JitDumpAtOSROffset();
2785+
2786+
if (desiredOffset != -1)
2787+
{
2788+
verboseDump = (((IL_OFFSET)desiredOffset) == info.compILEntry);
2789+
}
2790+
}
2791+
27802792
if (verboseDump)
27812793
{
27822794
verbose = true;
@@ -4447,6 +4459,10 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
44474459
//
44484460
DoPhase(this, PHASE_IMPORTATION, &Compiler::fgImport);
44494461

4462+
// Expand any patchpoints
4463+
//
4464+
DoPhase(this, PHASE_PATCHPOINTS, &Compiler::fgTransformPatchpoints);
4465+
44504466
// If instrumenting, add block and class probes.
44514467
//
44524468
if (compileFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR))
@@ -4458,10 +4474,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
44584474
//
44594475
DoPhase(this, PHASE_INDXCALL, &Compiler::fgTransformIndirectCalls);
44604476

4461-
// Expand any patchpoints
4462-
//
4463-
DoPhase(this, PHASE_PATCHPOINTS, &Compiler::fgTransformPatchpoints);
4464-
44654477
// PostImportPhase: cleanup inlinees
44664478
//
44674479
auto postImportPhase = [this]() {
@@ -6375,9 +6387,21 @@ int Compiler::compCompileHelper(CORINFO_MODULE_HANDLE classPtr,
63756387
#ifdef DEBUG
63766388
if ((JitConfig.DumpJittedMethods() == 1) && !compIsForInlining())
63776389
{
6390+
enum
6391+
{
6392+
BUFSIZE = 20
6393+
};
6394+
char osrBuffer[BUFSIZE] = {0};
6395+
if (opts.IsOSR())
6396+
{
6397+
// Tiering name already includes "OSR", we just want the IL offset
6398+
//
6399+
sprintf_s(osrBuffer, BUFSIZE, " @0x%x", info.compILEntry);
6400+
}
6401+
63786402
printf("Compiling %4d %s::%s, IL size = %u, hash=0x%08x %s%s%s\n", Compiler::jitTotalMethodCompiled,
63796403
info.compClassName, info.compMethodName, info.compILCodeSize, info.compMethodHash(),
6380-
compGetTieringName(), opts.IsOSR() ? " OSR" : "", compGetStressMessage());
6404+
compGetTieringName(), osrBuffer, compGetStressMessage());
63816405
}
63826406
if (compIsForInlining())
63836407
{

src/coreclr/jit/fgopt.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1553,7 +1553,18 @@ void Compiler::fgPostImportationCleanup()
15531553
//
15541554
auto addConditionalFlow = [this, entryStateVar, &entryJumpTarget](BasicBlock* fromBlock,
15551555
BasicBlock* toBlock) {
1556-
fgSplitBlockAtBeginning(fromBlock);
1556+
1557+
// We may have previously thought this try entry was unreachable, but now we're going to
1558+
// step through it on the way to the OSR entry. So ensure it has plausible profile weight.
1559+
//
1560+
if (fgHaveProfileData() && !fromBlock->hasProfileWeight())
1561+
{
1562+
JITDUMP("Updating block weight for now-reachable try entry " FMT_BB " via " FMT_BB "\n",
1563+
fromBlock->bbNum, fgFirstBB->bbNum);
1564+
fromBlock->inheritWeight(fgFirstBB);
1565+
}
1566+
1567+
BasicBlock* const newBlock = fgSplitBlockAtBeginning(fromBlock);
15571568
fromBlock->bbFlags |= BBF_INTERNAL;
15581569

15591570
GenTree* const entryStateLcl = gtNewLclvNode(entryStateVar, TYP_INT);
@@ -1565,6 +1576,7 @@ void Compiler::fgPostImportationCleanup()
15651576
fromBlock->bbJumpKind = BBJ_COND;
15661577
fromBlock->bbJumpDest = toBlock;
15671578
fgAddRefPred(toBlock, fromBlock);
1579+
newBlock->inheritWeight(fromBlock);
15681580

15691581
entryJumpTarget = fromBlock;
15701582
};

src/coreclr/jit/fgprofile.cpp

Lines changed: 83 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -300,10 +300,10 @@ class Instrumentor
300300
virtual void BuildSchemaElements(BasicBlock* block, Schema& schema)
301301
{
302302
}
303-
virtual void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
303+
virtual void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
304304
{
305305
}
306-
virtual void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
306+
virtual void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory)
307307
{
308308
}
309309
virtual void SuppressProbes()
@@ -349,8 +349,8 @@ class BlockCountInstrumentor : public Instrumentor
349349
}
350350
void Prepare(bool isPreImport) override;
351351
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
352-
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
353-
void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) override;
352+
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
353+
void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory) override;
354354
};
355355

356356
//------------------------------------------------------------------------
@@ -428,7 +428,7 @@ void BlockCountInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche
428428
// schema -- instrumentation schema
429429
// profileMemory -- profile data slab
430430
//
431-
void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
431+
void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
432432
{
433433
const ICorJitInfo::PgoInstrumentationSchema& entry = schema[block->bbCountSchemaIndex];
434434

@@ -464,7 +464,7 @@ void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE*
464464
// Notes:
465465
// When prejitting, add the method entry callback node
466466
//
467-
void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
467+
void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory)
468468
{
469469
Compiler::Options& opts = m_comp->opts;
470470
Compiler::Info& info = m_comp->info;
@@ -1002,7 +1002,7 @@ class EfficientEdgeCountInstrumentor : public Instrumentor, public SpanningTreeV
10021002
return ((block->bbFlags & BBF_IMPORTED) == BBF_IMPORTED);
10031003
}
10041004
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
1005-
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
1005+
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
10061006

10071007
void Badcode() override
10081008
{
@@ -1136,7 +1136,7 @@ void EfficientEdgeCountInstrumentor::BuildSchemaElements(BasicBlock* block, Sche
11361136
// schema -- instrumentation schema
11371137
// profileMemory -- profile data slab
11381138
//
1139-
void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
1139+
void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
11401140
{
11411141
// Inlinee compilers build their blocks in the root compiler's
11421142
// graph. So for NumSucc, we use the root compiler instance.
@@ -1311,12 +1311,12 @@ class BuildClassProbeSchemaGen
13111311
class ClassProbeInserter
13121312
{
13131313
Schema& m_schema;
1314-
BYTE* m_profileMemory;
1314+
uint8_t* m_profileMemory;
13151315
int* m_currentSchemaIndex;
13161316
unsigned& m_instrCount;
13171317

13181318
public:
1319-
ClassProbeInserter(Schema& schema, BYTE* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
1319+
ClassProbeInserter(Schema& schema, uint8_t* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
13201320
: m_schema(schema)
13211321
, m_profileMemory(profileMemory)
13221322
, m_currentSchemaIndex(pCurrentSchemaIndex)
@@ -1353,7 +1353,7 @@ class ClassProbeInserter
13531353

13541354
// Figure out where the table is located.
13551355
//
1356-
BYTE* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
1356+
uint8_t* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
13571357
*m_currentSchemaIndex += 2; // There are 2 schema entries per class probe
13581358

13591359
// Grab a temp to hold the 'this' object as it will be used three times
@@ -1430,7 +1430,7 @@ class ClassProbeInstrumentor : public Instrumentor
14301430
}
14311431
void Prepare(bool isPreImport) override;
14321432
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
1433-
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
1433+
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
14341434
void SuppressProbes() override;
14351435
};
14361436

@@ -1494,7 +1494,7 @@ void ClassProbeInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche
14941494
// schema -- instrumentation schema
14951495
// profileMemory -- profile data slab
14961496
//
1497-
void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
1497+
void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
14981498
{
14991499
if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
15001500
{
@@ -1567,21 +1567,43 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
15671567
// Choose instrumentation technology.
15681568
//
15691569
// We enable edge profiling by default, except when:
1570+
//
15701571
// * disabled by option
15711572
// * we are prejitting
1572-
// * we are jitting osr methods
1573+
// * we are jitting tier0 methods with patchpoints
1574+
// * we are jitting an OSR method
15731575
//
1574-
// Currently, OSR is incompatible with edge profiling. So if OSR is enabled,
1575-
// always do block profiling.
1576+
// OSR is incompatible with edge profiling. Only portions of the Tier0
1577+
// method will be executed, and the bail-outs at patchpoints won't be obvious
1578+
// exit points from the method. So for OSR we always do block profiling.
15761579
//
15771580
// Note this incompatibility only exists for methods that actually have
1578-
// patchpoints, but we won't know that until we import.
1581+
// patchpoints. Currently we will only place patchpoints in methods with
1582+
// backwards jumps.
1583+
//
1584+
// And because we want the Tier1 method to see the full set of profile data,
1585+
// when OSR is enabled, both Tier0 and any OSR methods need to contribute to
1586+
// the same profile data set. Since Tier0 has laid down a dense block-based
1587+
// schema, the OSR methods must use this schema as well.
1588+
//
1589+
// Note that OSR methods may also inline. We currently won't instrument
1590+
// any inlinee contributions (which would also need to carefully "share"
1591+
// the profile data segment with any Tier0 version and/or any other equivalent
1592+
// inlinee), so we'll lose a bit of their profile data. We can support this
1593+
// eventually if it turns out to matter.
1594+
//
1595+
// Similar issues arise with partially jitted methods. Because we currently
1596+
// only defer jitting for throw blocks, we currently ignore the impact of partial
1597+
// jitting on PGO. If we ever implement a broader pattern of deferral -- say deferring
1598+
// based on static PGO -- we will need to reconsider.
15791599
//
15801600
CLANG_FORMAT_COMMENT_ANCHOR;
15811601

1582-
const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
1583-
const bool osr = (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && (JitConfig.TC_OnStackReplacement() > 0));
1584-
const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !osr;
1602+
const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
1603+
const bool tier0WithPatchpoints = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) &&
1604+
(JitConfig.TC_OnStackReplacement() > 0) && compHasBackwardJump;
1605+
const bool osrMethod = opts.IsOSR();
1606+
const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !osrMethod;
15851607

15861608
if (useEdgeProfiles)
15871609
{
@@ -1590,7 +1612,9 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
15901612
else
15911613
{
15921614
JITDUMP("Using block profiling, because %s\n",
1593-
(JitConfig.JitEdgeProfiling() > 0) ? "edge profiles disabled" : prejit ? "prejitting" : "OSR");
1615+
(JitConfig.JitEdgeProfiling() > 0)
1616+
? "edge profiles disabled"
1617+
: prejit ? "prejitting" : osrMethod ? "OSR" : "tier0 with patchpoints");
15941618

15951619
fgCountInstrumentor = new (this, CMK_Pgo) BlockCountInstrumentor(this);
15961620
}
@@ -1640,7 +1664,7 @@ PhaseStatus Compiler::fgInstrumentMethod()
16401664
{
16411665
noway_assert(!compIsForInlining());
16421666

1643-
// Make post-importpreparations.
1667+
// Make post-import preparations.
16441668
//
16451669
const bool isPreImport = false;
16461670
fgCountInstrumentor->Prepare(isPreImport);
@@ -1665,7 +1689,17 @@ PhaseStatus Compiler::fgInstrumentMethod()
16651689
// Verify we created schema for the calls needing class probes.
16661690
// (we counted those when importing)
16671691
//
1668-
assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount);
1692+
// This is not true when we do partial compilation; it can/will erase class probes,
1693+
// and there's no easy way to figure out how many should be left.
1694+
//
1695+
if (doesMethodHavePartialCompilationPatchpoints())
1696+
{
1697+
assert(fgClassInstrumentor->SchemaCount() <= info.compClassProbeCount);
1698+
}
1699+
else
1700+
{
1701+
assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount);
1702+
}
16691703

16701704
// Optionally, when jitting, if there were no class probes and only one count probe,
16711705
// suppress instrumentation.
@@ -1698,11 +1732,16 @@ PhaseStatus Compiler::fgInstrumentMethod()
16981732

16991733
assert(schema.size() > 0);
17001734

1701-
// Allocate the profile buffer
1735+
// Allocate/retrieve the profile buffer.
17021736
//
1703-
BYTE* profileMemory;
1704-
1705-
HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
1737+
// If this is an OSR method, we should use the same buffer that the Tier0 method used.
1738+
//
1739+
// This is supported by allocPgoInstrumentationBySchema, which will verify the schema
1740+
// we provide here matches the one from Tier0, and will fill in the data offsets in
1741+
// our schema properly.
1742+
//
1743+
uint8_t* profileMemory;
1744+
HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
17061745
(UINT32)schema.size(), &profileMemory);
17071746

17081747
// Deal with allocation failures.
@@ -1924,6 +1963,14 @@ void Compiler::fgIncorporateBlockCounts()
19241963
fgSetProfileWeight(block, profileWeight);
19251964
}
19261965
}
1966+
1967+
// For OSR, give the method entry (which will be a scratch BB)
1968+
// the same weight as the OSR Entry.
1969+
//
1970+
if (opts.IsOSR())
1971+
{
1972+
fgFirstBB->inheritWeight(fgOSREntryBB);
1973+
}
19271974
}
19281975

19291976
//------------------------------------------------------------------------
@@ -3277,11 +3324,17 @@ void Compiler::fgComputeCalledCount(weight_t returnWeight)
32773324

32783325
BasicBlock* firstILBlock = fgFirstBB; // The first block for IL code (i.e. for the IL code at offset 0)
32793326

3280-
// Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block.
3327+
// OSR methods can have complex entry flow, and so
3328+
// for OSR we ensure fgFirstBB has plausible profile data.
32813329
//
3282-
while (firstILBlock->bbFlags & BBF_INTERNAL)
3330+
if (!opts.IsOSR())
32833331
{
3284-
firstILBlock = firstILBlock->bbNext;
3332+
// Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block.
3333+
//
3334+
while (firstILBlock->bbFlags & BBF_INTERNAL)
3335+
{
3336+
firstILBlock = firstILBlock->bbNext;
3337+
}
32853338
}
32863339

32873340
// The 'firstILBlock' is now expected to have a profile-derived weight

0 commit comments

Comments
 (0)