From 1a17140fa76504fb107871ae644147032b761090 Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Wed, 9 Jun 2021 22:54:45 -0700
Subject: [PATCH 1/5] Update spin-wait pause/yield normalization

- Modified the measurement to take much less time and to remeasure
  periodically, to reduce CPU usage during startup
- Each measurement times pause/yield over a duration on the order of a few
  microseconds
- A small history of recent measurements is retained; for now, the lowest
  recent measurement is used for normalization
- Measurements are done lazily, and at most every few seconds another
  measurement is taken
- Added a profiling event that includes info about a measurement and the
  established value from recent measurements that is used for normalization
---
 .../src/System/Threading/Thread.CoreCLR.cs   |  23 +-
 src/coreclr/inc/yieldprocessornormalized.h   | 119 ++++++--
 .../utilcode/yieldprocessornormalized.cpp    |  16 +-
 src/coreclr/vm/CMakeLists.txt                |   2 +-
 src/coreclr/vm/ClrEtwAll.man                 |  27 +-
 src/coreclr/vm/ClrEtwAllMeta.lst             |   8 +-
 src/coreclr/vm/comsynchronizable.cpp         |  17 +-
 src/coreclr/vm/comsynchronizable.h           |   2 +-
 src/coreclr/vm/ecalllist.h                   |   2 +-
 src/coreclr/vm/eventtrace.cpp                |   6 +
 src/coreclr/vm/finalizerthread.cpp           |   5 -
 src/coreclr/vm/threads.cpp                   |   9 +-
 src/coreclr/vm/yieldprocessornormalized.cpp  | 285 ++++++++++++++----
 .../System/Threading/LowLevelSpinWaiter.cs   |   4 -
 .../src/System/Threading/SpinWait.cs         |   4 -
 15 files changed, 391 insertions(+), 138 deletions(-)

diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index c1a968ed9f9948..83be00cf5e03f6 100644
--- a/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -322,33 +322,14 @@ public void DisableComObjectEagerCleanup()
         [MethodImpl(MethodImplOptions.InternalCall)]
         public extern bool Join(int millisecondsTimeout);
 
-        private static int s_optimalMaxSpinWaitsPerSpinIteration;
-
-        [DllImport(RuntimeHelpers.QCall)]
-        private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal();
-
         /// <summary>
         /// Max value to be passed into <see cref="SpinWait(int)"/> for optimal delaying. This value is normalized to be
        /// appropriate for the processor.
         /// </summary>
         internal static int OptimalMaxSpinWaitsPerSpinIteration
         {
-            get
-            {
-                int optimalMaxSpinWaitsPerSpinIteration = s_optimalMaxSpinWaitsPerSpinIteration;
-                return optimalMaxSpinWaitsPerSpinIteration != 0 ? optimalMaxSpinWaitsPerSpinIteration : CalculateOptimalMaxSpinWaitsPerSpinIteration();
-            }
-        }
-
-        [MethodImpl(MethodImplOptions.NoInlining)]
-        private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
-        {
-            // This is done lazily because the first call to the function below in the process triggers a measurement that
-            // takes a nontrivial amount of time if the measurement has not already been done in the background.
-            // See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
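Under the old scheme in the removed code above, the first read of the property could trigger an expensive measurement. With this patch, consumers instead construct a YieldProcessorNormalizationInfo, whose constructor schedules a remeasurement when the current values are stale. A minimal consumer sketch using the two calls declared in this patch's yieldprocessornormalized.h; the spin loop itself is hypothetical, not code from the patch:

```cpp
#include "yieldprocessornormalized.h" // the header changed by this patch

// Hypothetical VM-side spin loop illustrating the new consumer pattern
bool TrySpinForCondition(bool (*condition)())
{
    // Snapshots the current normalization values and, as a side effect,
    // calls YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
    YieldProcessorNormalizationInfo normalizationInfo;
    for (unsigned int spinIteration = 0; spinIteration < 16; ++spinIteration)
    {
        if (condition())
        {
            return true;
        }
        // Exponential backoff, capped by the normalized per-iteration maximum
        YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration);
    }
    return false;
}
```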
-            s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
-            Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
-            return s_optimalMaxSpinWaitsPerSpinIteration;
+            [MethodImpl(MethodImplOptions.InternalCall)]
+            get;
         }
 
         [MethodImpl(MethodImplOptions.InternalCall)]
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index ba349bb83ad567..ca9e7b2ae3e982 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -12,14 +12,58 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
 #endif
 #define YieldProcessor Dont_Use_YieldProcessor
 
-const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
-const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+#define DISABLE_COPY(T) \
+    T(const T &) = delete; \
+    T &operator =(const T &) = delete
 
-extern unsigned int g_yieldsPerNormalizedYield;
-extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
+#define DISABLE_CONSTRUCT_COPY(T) \
+    T() = delete; \
+    DISABLE_COPY(T)
 
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
+class YieldProcessorNormalization
+{
+public:
+    static const unsigned int TargetNsPerNormalizedYield = 37;
+    static const unsigned int TargetMaxNsPerSpinIteration = 272;
+
+    // These are maximums for the values computed for normalization, based on how they are calculated
+    static const unsigned int MaxYieldsPerNormalizedYield = TargetNsPerNormalizedYield * 10;
+    static const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
+        TargetMaxNsPerSpinIteration * 3 / (TargetNsPerNormalizedYield * 2) + 1;
+
+private:
+    static bool s_isMeasurementScheduled;
+
+    static unsigned int s_yieldsPerNormalizedYield;
+    static unsigned int s_optimalMaxNormalizedYieldsPerSpinIteration;
+
+public:
+    static bool IsMeasurementScheduled()
+    {
+        return s_isMeasurementScheduled;
+    }
+
+    static void PerformMeasurement();
+
+private:
+    static void ScheduleMeasurementIfNecessary();
+
+public:
+    static unsigned int GetOptimalMaxNormalizedYieldsPerSpinIteration()
+    {
+        return s_optimalMaxNormalizedYieldsPerSpinIteration;
+    }
+
+    static void FireMeasurementEvents();
+
+private:
+    static double UntornLoad(double *valueRef);
+
+    DISABLE_CONSTRUCT_COPY(YieldProcessorNormalization);
+
+    friend class YieldProcessorNormalizationInfo;
+    friend void YieldProcessorNormalizedForPreSkylakeCount(unsigned int);
+};
 
 class YieldProcessorNormalizationInfo
 {
@@ -30,12 +74,15 @@ class YieldProcessorNormalizationInfo
 
 public:
     YieldProcessorNormalizationInfo()
-        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
-        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+        : yieldsPerNormalizedYield(YieldProcessorNormalization::s_yieldsPerNormalizedYield),
+        optimalMaxNormalizedYieldsPerSpinIteration(YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration),
         optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
     {
+        YieldProcessorNormalization::ScheduleMeasurementIfNecessary();
     }
 
+    DISABLE_COPY(YieldProcessorNormalizationInfo);
+
     friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
     friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
     friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
@@ -98,9 +145,8 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
 
     if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (count > MaxCount)
         {
             count = MaxCount;
@@ -144,9 +190,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 
     if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
             preSkylakeCount = MaxCount;
@@ -175,7 +220,35 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 // }
 FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
 {
-    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
+    // This function does not forward to the one above because it is used by some code under utilcode, where
+    // YieldProcessorNormalizationInfo cannot be used since normalization does not happen in some of its consumers. So this
+    // version uses the fields in YieldProcessorNormalization directly.
+
+    _ASSERTE(preSkylakeCount != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
+        if (preSkylakeCount > MaxCount)
+        {
+            preSkylakeCount = MaxCount;
+        }
+    }
+
+    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
+    SIZE_T n =
+        (SIZE_T)preSkylakeCount *
+        YieldProcessorNormalization::s_yieldsPerNormalizedYield /
+        PreSkylakeCountToSkylakeCountDivisor;
+    if (n == 0)
+    {
+        n = 1;
+    }
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
 }
 
 // See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
@@ -193,15 +266,12 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
     const YieldProcessorNormalizationInfo &normalizationInfo,
     unsigned int spinIteration)
 {
-    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
-    // InitializeYieldProcessorNormalized()
-    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
-        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
-    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    // This shift value should be adjusted based on the asserted condition below
+    // This shift value should be adjusted based on the asserted conditions below
     const UINT8 MaxShift = 3;
-    static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    static_assert_no_msg(
+        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    static_assert_no_msg(
+        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
 
     unsigned int n;
     if (spinIteration <= MaxShift &&
@@ -219,3 +289,6 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
         System_YieldProcessor();
     } while (--n != 0);
 }
+
+#undef DISABLE_CONSTRUCT_COPY
+#undef DISABLE_COPY
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index 4242f82792b47d..020d8d7cc79e4e 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -2,8 +2,16 @@
 // The .NET Foundation licenses this file to you under the MIT license.
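To make the static assertions above concrete, here is a standalone check (not part of the patch) that re-derives the limit from the patch's constants:

```cpp
#include <cstdio>

int main()
{
    // Constants from YieldProcessorNormalization in this patch
    const unsigned int TargetNsPerNormalizedYield = 37;
    const unsigned int TargetMaxNsPerSpinIteration = 272;

    // Same integer arithmetic as the header: 272 * 3 / (37 * 2) + 1 == 816 / 74 + 1 == 12
    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
        TargetMaxNsPerSpinIteration * 3 / (TargetNsPerNormalizedYield * 2) + 1;

    // MaxShift == 3 is the unique shift satisfying both asserted conditions:
    // (1u << 3) == 8 <= 12, and (1u << 4) == 16 > 12
    printf("MaxOptimalMaxNormalizedYieldsPerSpinIteration = %u\n",
        MaxOptimalMaxNormalizedYieldsPerSpinIteration);
    return 0;
}
```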
#include "stdafx.h" +#include "yieldprocessornormalized.h" -// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are -// tuned for Skylake processors -unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake -unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7; +bool YieldProcessorNormalization::s_isMeasurementScheduled; + +// Defaults are for when normalization has not yet been done +unsigned int YieldProcessorNormalization::s_yieldsPerNormalizedYield = 1; +unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration = + (unsigned int) + ( + (double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration / + YieldProcessorNormalization::TargetNsPerNormalizedYield + + 0.5 + ); diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt index 1d682d2a428bbf..f16005660dc345 100644 --- a/src/coreclr/vm/CMakeLists.txt +++ b/src/coreclr/vm/CMakeLists.txt @@ -136,7 +136,6 @@ set(VM_SOURCES_DAC_AND_WKS_COMMON versionresilienthashcode.cpp virtualcallstub.cpp win32threadpool.cpp - yieldprocessornormalized.cpp zapsig.cpp ) @@ -389,6 +388,7 @@ set(VM_SOURCES_WKS threadsuspend.cpp typeparse.cpp weakreferencenative.cpp + yieldprocessornormalized.cpp ${VM_SOURCES_GDBJIT} ) diff --git a/src/coreclr/vm/ClrEtwAll.man b/src/coreclr/vm/ClrEtwAll.man index d8a275c6da6295..0ffca6cb3cb6ba 100644 --- a/src/coreclr/vm/ClrEtwAll.man +++ b/src/coreclr/vm/ClrEtwAll.man @@ -438,7 +438,13 @@ - + + + + + @@ -2916,6 +2922,19 @@ + + @@ -3313,6 +3332,10 @@ keywords ="ThreadingKeyword" opcode="Wait" task="ThreadPoolWorkerThread" symbol="ThreadPoolWorkerThreadWait" message="$(string.RuntimePublisher.ThreadPoolWorkerThreadEventMessage)"/> + + + @@ -7410,6 +7434,7 @@ + diff --git a/src/coreclr/vm/ClrEtwAllMeta.lst b/src/coreclr/vm/ClrEtwAllMeta.lst index 285e9101c6321d..d565e015a8b050 100644 --- a/src/coreclr/vm/ClrEtwAllMeta.lst +++ b/src/coreclr/vm/ClrEtwAllMeta.lst @@ -134,9 +134,9 @@ nomac:GarbageCollection:::GCJoin_V2 nostack:Type:::BulkType -################### -# Threadpool events -################### +################################# +# Threading and Threadpool events +################################# nomac:WorkerThreadCreation:::WorkerThreadCreate noclrinstanceid:WorkerThreadCreation:::WorkerThreadCreate nomac:WorkerThreadCreation:::WorkerThreadTerminate @@ -170,6 +170,8 @@ nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment +nomac:YieldProcessorMeasurement:::YieldProcessorMeasurement +nostack:YieldProcessorMeasurement:::YieldProcessorMeasurement ################## # Exception events diff --git a/src/coreclr/vm/comsynchronizable.cpp b/src/coreclr/vm/comsynchronizable.cpp index 39f00d06741933..15a33c711e7a95 100644 --- a/src/coreclr/vm/comsynchronizable.cpp +++ b/src/coreclr/vm/comsynchronizable.cpp @@ -1089,22 +1089,13 @@ FCIMPL1(void, ThreadNative::SetIsThreadpoolThread, ThreadBaseObject* thread) } FCIMPLEND -INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration() +FCIMPL0(INT32, ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration) { - QCALL_CONTRACT; - - INT32 optimalMaxNormalizedYieldsPerSpinIteration; - - BEGIN_QCALL; - - // 
RuntimeThread calls this function only once lazily and caches the result, so ensure initialization - EnsureYieldProcessorNormalizedInitialized(); - optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration; - - END_QCALL; + FCALL_CONTRACT; - return optimalMaxNormalizedYieldsPerSpinIteration; + return (INT32)YieldProcessorNormalization::GetOptimalMaxNormalizedYieldsPerSpinIteration(); } +FCIMPLEND FCIMPL1(void, ThreadNative::SpinWait, int iterations) { diff --git a/src/coreclr/vm/comsynchronizable.h b/src/coreclr/vm/comsynchronizable.h index e9968201b8bc20..cfab18d9010706 100644 --- a/src/coreclr/vm/comsynchronizable.h +++ b/src/coreclr/vm/comsynchronizable.h @@ -86,7 +86,7 @@ friend class ThreadBaseObject; UINT64 QCALLTYPE GetProcessDefaultStackSize(); static FCDECL1(INT32, GetManagedThreadId, ThreadBaseObject* th); - static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration(); + static FCDECL0(INT32, GetOptimalMaxSpinWaitsPerSpinIteration); static FCDECL1(void, SpinWait, int iterations); static BOOL QCALLTYPE YieldThread(); static FCDECL0(Object*, GetCurrentThread); diff --git a/src/coreclr/vm/ecalllist.h b/src/coreclr/vm/ecalllist.h index f77dc75c80b5cd..ea3f65d72917d2 100644 --- a/src/coreclr/vm/ecalllist.h +++ b/src/coreclr/vm/ecalllist.h @@ -602,7 +602,7 @@ FCFuncStart(gThreadFuncs) #endif // FEATURE_COMINTEROP FCFuncElement("Interrupt", ThreadNative::Interrupt) FCFuncElement("Join", ThreadNative::Join) - QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration) + FCFuncElement("get_OptimalMaxSpinWaitsPerSpinIteration", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration) FCFuncElement("GetCurrentProcessorNumber", ThreadNative::GetCurrentProcessorNumber) FCFuncEnd() diff --git a/src/coreclr/vm/eventtrace.cpp b/src/coreclr/vm/eventtrace.cpp index 14bd22538d2b58..05f5f7509c9791 100644 --- a/src/coreclr/vm/eventtrace.cpp +++ b/src/coreclr/vm/eventtrace.cpp @@ -4680,6 +4680,12 @@ extern "C" { ETW::EnumerationLog::EnumerateForCaptureState(); } + + if (g_fEEStarted && !g_fEEShutDown) + { + // Emit the YieldProcessor measured values at the beginning of the trace + YieldProcessorNormalization::FireMeasurementEvents(); + } } #ifdef FEATURE_COMINTEROP if (ETW_EVENT_ENABLED(MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Context, CCWRefCountChange)) diff --git a/src/coreclr/vm/finalizerthread.cpp b/src/coreclr/vm/finalizerthread.cpp index 1e4dbf913c898e..e8370315e66651 100644 --- a/src/coreclr/vm/finalizerthread.cpp +++ b/src/coreclr/vm/finalizerthread.cpp @@ -379,11 +379,6 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args) { GetFinalizerThread()->SetBackground(TRUE); - { - GCX_PREEMP(); - EnsureYieldProcessorNormalizedInitialized(); - } - while (!fQuitFinalizer) { // This will apply any policy for swallowing exceptions during normal diff --git a/src/coreclr/vm/threads.cpp b/src/coreclr/vm/threads.cpp index fa93110399d397..01ae12d14d3b8c 100644 --- a/src/coreclr/vm/threads.cpp +++ b/src/coreclr/vm/threads.cpp @@ -1131,8 +1131,6 @@ void InitThreadManager() } CONTRACTL_END; - InitializeYieldProcessorNormalizedCrst(); - // All patched helpers should fit into one page. // If you hit this assert on retail build, there is most likely problem with BBT script. 
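A note on the comsynchronizable.cpp change above: the getter moved from a QCall to an FCall because it no longer has real work to do. The distinction, summarized as comments (general CLR convention, not text from this patch):

```cpp
// QCall: a regular native function entered through a managed-to-native transition;
// BEGIN_QCALL/END_QCALL set up a frame and switch to preemptive mode, so the body
// may block, take locks, and run something like the old measurement initialization.
//
// FCall: invoked directly from managed code with no transition frame, so the body
// must stay simple (FCALL_CONTRACT, no GC triggering) -- which the getter now can,
// since it only reads a static that the finalizer thread keeps up to date.
```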
     _ASSERTE_ALL_BUILDS("clr/src/VM/threads.cpp", (BYTE*)JIT_PatchedCodeLast - (BYTE*)JIT_PatchedCodeStart > (ptrdiff_t)0);
 
@@ -7145,6 +7143,7 @@ BOOL Thread::HaveExtraWorkForFinalizer()
         || Thread::CleanupNeededForFinalizedThread()
         || (m_DetachCount > 0)
         || SystemDomain::System()->RequireAppDomainCleanup()
+        || YieldProcessorNormalization::IsMeasurementScheduled()
         || ThreadStore::s_pThreadStore->ShouldTriggerGCForDeadThreads();
 }
 
@@ -7191,6 +7190,12 @@ void Thread::DoExtraWorkForFinalizer()
         // If there were any TimerInfos waiting to be released, they'll get flushed now
         ThreadpoolMgr::FlushQueueOfTimerInfos();
 
+    if (YieldProcessorNormalization::IsMeasurementScheduled())
+    {
+        GCX_PREEMP();
+        YieldProcessorNormalization::PerformMeasurement();
+    }
+
     ThreadStore::s_pThreadStore->TriggerGCForDeadThreadsIfNecessary();
 }
 
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 91547923310fbd..061d254a429f30 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -2,17 +2,33 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 #include "common.h"
+#include "yieldprocessornormalized.h"
 
-static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
-static CrstStatic s_initializeYieldProcessorNormalizedCrst;
+#ifndef CROSSGEN_COMPILE
 
-void InitializeYieldProcessorNormalizedCrst()
+#include "finalizerthread.h"
+
+enum class NormalizationState : UINT8
 {
-    WRAPPER_NO_CONTRACT;
-    s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock);
-}
+    Uninitialized,
+    Initialized,
+    Failed
+};
+
+static const int NsPerYieldMeasurementCount = 8;
+static const unsigned int MeasurementPeriodMs = 4000;
+
+static const unsigned int NsPerS = 1000 * 1000 * 1000;
+
+static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
+static unsigned int s_previousNormalizationTimeMs;
+
+static UINT64 s_performanceCounterTicksPerS;
+static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
+static int s_nextMeasurementIndex;
+static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
 
-static void InitializeYieldProcessorNormalized()
+static unsigned int DetermineMeasureDurationUs()
 {
     CONTRACTL
     {
@@ -22,92 +38,251 @@ static void InitializeYieldProcessorNormalized()
     }
     CONTRACTL_END;
 
-    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
 
-    if (s_isYieldProcessorNormalizedInitialized)
+    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
+    // if the overhead seems high relative to the measure duration.
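Concretely, the check that follows times two back-to-back QueryPerformanceCounter() calls and quadruples the measure duration when that overhead reaches a quarter of it. A standalone worked example, assuming the common 10 MHz counter frequency (one tick = 100 ns); the real frequency comes from QueryPerformanceFrequency():

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Assumed QPC frequency; the patch reads the real value into s_performanceCounterTicksPerS
    const uint64_t ticksPerS = 10000000;
    const uint64_t NsPerS = 1000000000;
    const unsigned int measureDurationUs = 1;

    // Same expression as the check below: 1/4 of the measure duration, expressed in ticks
    uint64_t thresholdTicks = ticksPerS * measureDurationUs * (1000 / 4) / NsPerS;
    printf("threshold = %llu ticks (= %llu ns of QPC overhead)\n",
        (unsigned long long)thresholdTicks,
        (unsigned long long)(thresholdTicks * NsPerS / ticksPerS));
    // => 2 ticks (200 ns): if two adjacent QPC calls are that far apart, the 1 us
    //    measure duration is quadrupled to 4 us so timing overhead does not dominate
    return 0;
}
```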
+    unsigned int measureDurationUs = 1;
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    UINT64 startTicks = li.QuadPart;
+    QueryPerformanceCounter(&li);
+    UINT64 elapsedTicks = li.QuadPart - startTicks;
+    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
     {
-        return;
+        measureDurationUs *= 4;
     }
+    return measureDurationUs;
+}
 
-    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
-    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
-    const int MeasureDurationMs = 10;
-    const int NsPerSecond = 1000 * 1000 * 1000;
+static double MeasureNsPerYield(unsigned int measureDurationUs)
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
+    UINT64 ticksPerS = s_performanceCounterTicksPerS;
+    UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
 
     LARGE_INTEGER li;
-    if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
+    QueryPerformanceCounter(&li);
+    UINT64 startTicks = li.QuadPart;
+
+    for (int i = 0; i < yieldCount; ++i)
     {
-        // High precision clock not available or clock resolution is too low, resort to defaults
-        s_isYieldProcessorNormalizedInitialized = true;
-        return;
+        System_YieldProcessor();
     }
-    ULONGLONG ticksPerSecond = li.QuadPart;
 
-    // Measure the nanosecond delay per yield
-    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
-    unsigned int yieldCount = 0;
     QueryPerformanceCounter(&li);
-    ULONGLONG startTicks = li.QuadPart;
-    ULONGLONG elapsedTicks;
-    do
-    {
-        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
-        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
-        // low microsecond range.
-        for (int i = 0; i < 1000; ++i)
+    UINT64 elapsedTicks = li.QuadPart - startTicks;
+    while (elapsedTicks < measureDurationTicks)
+    {
+        int nextYieldCount =
+            Max(4,
+                elapsedTicks == 0
+                    ? yieldCount / 4
+                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
+        for (int i = 0; i < nextYieldCount; ++i)
         {
             System_YieldProcessor();
         }
-        yieldCount += 1000;
 
         QueryPerformanceCounter(&li);
-        ULONGLONG nowTicks = li.QuadPart;
-        elapsedTicks = nowTicks - startTicks;
-    } while (elapsedTicks < measureDurationTicks);
-    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
-    if (nsPerYield < 1)
+        elapsedTicks = li.QuadPart - startTicks;
+        yieldCount += nextYieldCount;
+    }
+
+    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
+    const double MinNsPerYield = 0.1;
+
+    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
+    // really take this long. Limit the maximum to keep the recorded values reasonable.
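To see how the measurement loop above sizes itself, here is a standalone simulation (not code from the patch): it uses the patch's first-pass and proportional top-up formulas, with an assumed true cost of 5 ns/yield and with elapsed time computed rather than queried; the real code's elapsedTicks == 0 branch is omitted:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
    const double establishedNsPerYield = 37.0; // default until a measurement lands
    const double actualNsPerYield = 5.0;       // assumed true cost on this machine
    const double measureDurationNs = 1000.0;   // 1 us measure duration

    // First pass: same formula as the patch => 1000 / 37 + 1 == 28 yields
    int yieldCount = (int)(measureDurationNs / establishedNsPerYield) + 1;
    double elapsedNs = yieldCount * actualNsPerYield; // ~140 ns, not enough yet
    while (elapsedNs < measureDurationNs)
    {
        // Proportional top-up, as in the patch (ticks replaced by ns for clarity)
        int nextYieldCount = std::max(4,
            (int)(yieldCount * (measureDurationNs - elapsedNs) / elapsedNs) + 1);
        yieldCount += nextYieldCount;
        elapsedNs = yieldCount * actualNsPerYield;
    }
    // => measured 201 yields over 1005 ns => 5.00 ns/yield
    printf("measured %d yields over %.0f ns => %.2f ns/yield\n",
        yieldCount, elapsedNs, elapsedNs / yieldCount);
    return 0;
}
```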
+    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
+
+    return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
+}
+
+void YieldProcessorNormalization::PerformMeasurement()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_isMeasurementScheduled);
+
+    double latestNsPerYield;
+    if (s_normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return;
+        }
+
+        int nextMeasurementIndex = s_nextMeasurementIndex;
+        s_nsPerYieldMeasurements[nextMeasurementIndex] = latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
+        {
+            nextMeasurementIndex = 0;
+        }
+        s_nextMeasurementIndex = nextMeasurementIndex;
+    }
+    else if (s_normalizationState == NormalizationState::Uninitialized)
+    {
+        LARGE_INTEGER li;
+        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
+        {
+            // High precision clock not available or clock resolution is too low, resort to defaults
+            s_normalizationState = NormalizationState::Failed;
+            return;
+        }
+        s_performanceCounterTicksPerS = li.QuadPart;
+
+        unsigned int measureDurationUs = DetermineMeasureDurationUs();
+        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+        {
+            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
+            s_nsPerYieldMeasurements[i] = latestNsPerYield;
+            if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
+            {
+                s_establishedNsPerYield = latestNsPerYield;
+            }
+
+            if (i < NsPerYieldMeasurementCount - 1)
+            {
+                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+            }
+        }
+    }
+    else
     {
-        nsPerYield = 1;
+        _ASSERTE(s_normalizationState == NormalizationState::Failed);
+        return;
     }
 
-    // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
-    // value is naturally limited to MinNsPerNormalizedYield.
-    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
-    if (yieldsPerNormalizedYield < 1)
+    double establishedNsPerYield = s_nsPerYieldMeasurements[0];
+    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
     {
-        yieldsPerNormalizedYield = 1;
+        double nsPerYield = s_nsPerYieldMeasurements[i];
+        if (nsPerYield < establishedNsPerYield)
+        {
+            establishedNsPerYield = nsPerYield;
+        }
     }
-    _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
+    s_establishedNsPerYield = establishedNsPerYield;
+
+    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+
+    // Calculate the number of yields required to span the duration of a normalized yield
+    int yieldsPerNormalizedYield = Max(1, (int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
+    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
 
     // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
     // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
     // better job of allowing other work to run.
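Plugging representative measurements into the two normalization formulas (the yields-per-normalized-yield computation above, and the per-spin-iteration cap just below); the 5 ns and 140 ns figures are assumed pre-Skylake-like and post-Skylake-like outcomes, and std::max stands in for the CLR's Max helper:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
    const double TargetNsPerNormalizedYield = 37.0;  // from YieldProcessorNormalization
    const double TargetMaxNsPerSpinIteration = 272.0;

    const double samples[] = { 5.0, 140.0 }; // assumed measured ns/yield values
    for (double establishedNsPerYield : samples)
    {
        int yieldsPerNormalizedYield =
            std::max(1, (int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
        int optimalMaxNormalizedYieldsPerSpinIteration =
            std::max(1, (int)(TargetMaxNsPerSpinIteration /
                (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
        // => 5.0 ns/yield: 7 yields per normalized yield, cap 8 per spin iteration
        // => 140.0 ns/yield: 1 yield per normalized yield, cap 2 per spin iteration
        printf("%6.1f ns/yield -> %d yields per normalized yield, cap %d\n",
            establishedNsPerYield, yieldsPerNormalizedYield,
            optimalMaxNormalizedYieldsPerSpinIteration);
    }
    return 0;
}
```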
-    int optimalMaxNormalizedYieldsPerSpinIteration =
-        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
-    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
+    s_optimalMaxNormalizedYieldsPerSpinIteration =
+        Max(1, (int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+
+    s_previousNormalizationTimeMs = GetTickCount();
+    s_normalizationState = NormalizationState::Initialized;
+    s_isMeasurementScheduled = false;
+}
+
+#endif // !CROSSGEN_COMPILE
+
+void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+#ifndef CROSSGEN_COMPILE
+    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
+    if (normalizationState == NormalizationState::Initialized)
     {
-        optimalMaxNormalizedYieldsPerSpinIteration = 1;
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return;
+        }
+    }
+    else if (normalizationState == NormalizationState::Uninitialized)
+    {
+    }
+    else
+    {
+        _ASSERTE(normalizationState == NormalizationState::Failed);
+        return;
     }
 
-    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
-    s_isYieldProcessorNormalizedInitialized = true;
+    // The g_fEEStarted check is required for FinalizerThread::EnableFinalization() below
+    if (s_isMeasurementScheduled || !g_fEEStarted)
+    {
+        return;
+    }
 
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+    s_isMeasurementScheduled = true;
+    FinalizerThread::EnableFinalization();
+#endif // !CROSSGEN_COMPILE
 }
 
-void EnsureYieldProcessorNormalizedInitialized()
+#ifndef CROSSGEN_COMPILE
+
+void YieldProcessorNormalization::FireMeasurementEvents()
 {
     CONTRACTL
     {
         NOTHROW;
         GC_NOTRIGGER;
-        MODE_PREEMPTIVE;
+        MODE_ANY;
     }
     CONTRACTL_END;
 
-    if (!s_isYieldProcessorNormalizedInitialized)
+    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
+    // recorded information, so try to enumerate the array with some care.
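Stepping back, ScheduleMeasurementIfNecessary() above is the producer side of a handshake with the finalizer-thread hooks added in threads.cpp earlier in this patch. The flow, sketched as comments (all names are from the patch; timing details elided):

```cpp
// spinning thread                           finalizer thread
// ---------------                           ----------------
// YieldProcessorNormalizationInfo ctor
//   -> ScheduleMeasurementIfNecessary()
//        s_isMeasurementScheduled = true
//        FinalizerThread::EnableFinalization()
//                                            HaveExtraWorkForFinalizer()
//                                              -> IsMeasurementScheduled() == true
//                                            DoExtraWorkForFinalizer()
//                                              -> GCX_PREEMP();
//                                                 PerformMeasurement()
//                                                   (remeasures, updates the statics,
//                                                    clears s_isMeasurementScheduled)
```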
+    double establishedNsPerYield = UntornLoad(&s_establishedNsPerYield);
+    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
+    for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
     {
-        InitializeYieldProcessorNormalized();
+        double nsPerYield = UntornLoad(&s_nsPerYieldMeasurements[nextIndex]);
+        if (nsPerYield != 0) // the array may not be fully initialized yet
+        {
+            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
+        }
+
+        if (++nextIndex >= NsPerYieldMeasurementCount)
+        {
+            nextIndex = 0;
+        }
     }
 }
+
+double YieldProcessorNormalization::UntornLoad(double *valueRef)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    return VolatileLoadWithoutBarrier(valueRef);
+#else
+    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
+#endif
+}
+
+#endif // !CROSSGEN_COMPILE
diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
index e1c0766b3f0dfa..8e8198de392b05 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
@@ -71,10 +71,6 @@ public static void Wait(int spinIndex, int sleep0Threshold, int processorCount)
             // the equivalent of YieldProcessor(), as that that point SwitchToThread/Sleep(0) are more likely to be able to
             // allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
             // usually better for that.
-            //
-            // Thread.OptimalMaxSpinWaitsPerSpinIteration:
-            // - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
-            //
             int n = Thread.OptimalMaxSpinWaitsPerSpinIteration;
             if (spinIndex <= 30 && (1 << spinIndex) < n)
             {
diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
index b45cc7d5d3803a..66b73f8be02522 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
@@ -225,10 +225,6 @@ private void SpinOnceCore(int sleep1Threshold)
             // the equivalent of YieldProcessor(), as at that point SwitchToThread/Sleep(0) are more likely to be able to
             // allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
             // usually better for that.
-            //
-            // Thread.OptimalMaxSpinWaitsPerSpinIteration:
-            // - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
-            //
             int n = Thread.OptimalMaxSpinWaitsPerSpinIteration;
             if (_count <= 30 && (1 << _count) < n)
             {

From 84fa9a29e2eb5f770d15610e4a3d30cd4256617b Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Wed, 7 Jul 2021 22:02:04 -0700
Subject: [PATCH 2/5] Fix the other half of the potential for torn memory accesses
---
 src/coreclr/inc/yieldprocessornormalized.h  |  3 ++-
 src/coreclr/vm/yieldprocessornormalized.cpp | 29 ++++++++++++++++-----
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index ca9e7b2ae3e982..121e60b033356d 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -57,7 +57,8 @@ class YieldProcessorNormalization
     static void FireMeasurementEvents();
 
 private:
-    static double UntornLoad(double *valueRef);
+    static double AtomicLoad(double *valueRef);
+    static void AtomicStore(double *valueRef, double value);
 
     DISABLE_CONSTRUCT_COPY(YieldProcessorNormalization);
 
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 061d254a429f30..06214d85dc4414 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -130,7 +130,8 @@ void YieldProcessorNormalization::PerformMeasurement()
         }
 
         int nextMeasurementIndex = s_nextMeasurementIndex;
-        s_nsPerYieldMeasurements[nextMeasurementIndex] = latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
         if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
         {
             nextMeasurementIndex = 0;
@@ -152,10 +153,10 @@
         for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
         {
             latestNsPerYield = MeasureNsPerYield(measureDurationUs);
-            s_nsPerYieldMeasurements[i] = latestNsPerYield;
+            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
             if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
             {
-                s_establishedNsPerYield = latestNsPerYield;
+                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
             }
 
             if (i < NsPerYieldMeasurementCount - 1)
             {
@@ -179,7 +180,10 @@
             establishedNsPerYield = nsPerYield;
         }
     }
-    s_establishedNsPerYield = establishedNsPerYield;
+    if (establishedNsPerYield != s_establishedNsPerYield)
+    {
+        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
+    }
 
     FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
 
@@ -257,11 +261,11 @@
 
     // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
     // recorded information, so try to enumerate the array with some care.
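Patch 2's rename from UntornLoad to AtomicLoad (below) comes with a matching AtomicStore. The 32-bit trick deserves a note: a double is 8 bytes, and a 32-bit core may write it as two 4-byte halves, so a concurrent reader can observe a torn value. A standalone sketch of the load side, where InterlockedCompareExchange64 stands in for the patch's InterlockedCompareExchangeT:

```cpp
#include <cstring>
#include <windows.h>

static double AtomicLoadSketch(double *valueRef)
{
    // Exchanging 0 for 0 only writes when the current value is already 0, so the
    // compare-exchange is effectively a no-op -- but it returns the current 8 bytes
    // as one atomic access, which a plain 32-bit double read would not guarantee
    LONGLONG bits = InterlockedCompareExchange64((volatile LONGLONG *)valueRef, 0, 0);
    double value;
    memcpy(&value, &bits, sizeof(value));
    return value;
}
```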
-    double establishedNsPerYield = UntornLoad(&s_establishedNsPerYield);
+    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
     int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
     for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
     {
-        double nsPerYield = UntornLoad(&s_nsPerYieldMeasurements[nextIndex]);
+        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
         if (nsPerYield != 0) // the array may not be fully initialized yet
         {
             FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
@@ -274,7 +278,7 @@
     }
 }
 
-double YieldProcessorNormalization::UntornLoad(double *valueRef)
+double YieldProcessorNormalization::AtomicLoad(double *valueRef)
 {
     WRAPPER_NO_CONTRACT;
 
@@ -285,4 +289,15 @@
 #endif
 }
 
+void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    *valueRef = value;
+#else
+    InterlockedExchangeT(valueRef, value);
+#endif
+}
+
 #endif // !CROSSGEN_COMPILE

From 6efe43fc271a0a165508bdbfdc1645ce4dfeed64 Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Thu, 8 Jul 2021 12:37:13 -0700
Subject: [PATCH 3/5] Address feedback
---
 src/coreclr/vm/eventtrace.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/coreclr/vm/eventtrace.cpp b/src/coreclr/vm/eventtrace.cpp
index 05f5f7509c9791..321b3608b0b874 100644
--- a/src/coreclr/vm/eventtrace.cpp
+++ b/src/coreclr/vm/eventtrace.cpp
@@ -4417,6 +4417,12 @@ VOID EtwCallbackCommon(
     {
         ETW::TypeSystemLog::OnKeywordsChanged();
     }
+
+    if (g_fEEStarted && !g_fEEShutDown)
+    {
+        // Emit the YieldProcessor measured values at the beginning of the trace
+        YieldProcessorNormalization::FireMeasurementEvents();
+    }
 }
 
 // Individual callbacks for each EventPipe provider.
@@ -4680,12 +4686,6 @@ extern "C"
         {
             ETW::EnumerationLog::EnumerateForCaptureState();
         }
-
-        if (g_fEEStarted && !g_fEEShutDown)
-        {
-            // Emit the YieldProcessor measured values at the beginning of the trace
-            YieldProcessorNormalization::FireMeasurementEvents();
-        }
     }
 #ifdef FEATURE_COMINTEROP
     if (ETW_EVENT_ENABLED(MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Context, CCWRefCountChange))

From c81dd11e95dc247e40b7e09a186134a8d6df99ea Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Fri, 9 Jul 2021 11:36:38 -0700
Subject: [PATCH 4/5] Add check to see if event is enabled
---
 src/coreclr/vm/yieldprocessornormalized.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 06214d85dc4414..540aa57ec58f0e 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -259,6 +259,11 @@ void YieldProcessorNormalization::FireMeasurementEvents()
     }
     CONTRACTL_END;
 
+    if (!EventEnabledYieldProcessorMeasurement())
+    {
+        return;
+    }
+
     // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
     // recorded information, so try to enumerate the array with some care.
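For context on the surrounding lines: enumeration starts at s_nextMeasurementIndex, the slot that will be overwritten next, i.e. the oldest entry, so events fire oldest-first. A standalone illustration with a hypothetical 4-slot ring (the patch uses NsPerYieldMeasurementCount == 8, and the values here are made up):

```cpp
#include <cstdio>

int main()
{
    // Chronologically, 36.2 is the oldest value and 40.1 the newest; nextIndex == 1
    // marks the slot that would be overwritten next, i.e. the oldest entry
    const int Count = 4;
    double measurements[Count] = { 40.1, 36.2, 37.0, 38.5 };
    int nextIndex = 1;

    for (int i = 0; i < Count; ++i)
    {
        double nsPerYield = measurements[nextIndex];
        if (nsPerYield != 0) // zero means the slot has not been written yet
        {
            printf("%.1f\n", nsPerYield); // stands in for FireEtwYieldProcessorMeasurement
        }
        if (++nextIndex >= Count)
        {
            nextIndex = 0;
        }
    }
    return 0; // prints 36.2, 37.0, 38.5, 40.1 -- oldest to newest
}
```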
     double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
     int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
     for (int i = 0; i < NsPerYieldMeasurementCount; ++i)

From 2d1455cd086b343431e03beffe40212c5c99f2f1 Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Fri, 9 Jul 2021 18:54:36 -0700
Subject: [PATCH 5/5] Fix signed/unsigned mismatch
---
 src/coreclr/vm/yieldprocessornormalized.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 540aa57ec58f0e..2c51e73b678d8e 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -188,7 +188,7 @@ void YieldProcessorNormalization::PerformMeasurement()
     FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
 
     // Calculate the number of yields required to span the duration of a normalized yield
-    int yieldsPerNormalizedYield = Max(1, (int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+    unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
     _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
     s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
 
@@ -196,7 +196,7 @@ void YieldProcessorNormalization::PerformMeasurement()
     // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
     // better job of allowing other work to run.
     s_optimalMaxNormalizedYieldsPerSpinIteration =
-        Max(1, (int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+        Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
     _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
 
     GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
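The mismatch patch 5 fixes: the computed values were int while the destination fields and limit constants are unsigned int, so the stores and _ASSERTE comparisons mixed signedness. Keeping the whole computation unsigned, as above, resolves the compiler warning. A minimal before/after sketch, with std::max standing in for the CLR's Max helper:

```cpp
#include <algorithm>

static const unsigned int TargetNsPerNormalizedYield = 37; // from the patch
static unsigned int s_yieldsPerNormalizedYield;

void Before(double establishedNsPerYield)
{
    // int result stored into an unsigned field; subsequent comparisons against
    // unsigned limits warn about signed/unsigned mismatch
    int yields = std::max(1, (int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
    s_yieldsPerNormalizedYield = yields;
}

void After(double establishedNsPerYield)
{
    // Patch 5: both Max() arguments and the cast are unsigned, so the arithmetic
    // stays in one type end to end
    unsigned int yields =
        std::max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
    s_yieldsPerNormalizedYield = yields;
}
```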