From 1a17140fa76504fb107871ae644147032b761090 Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Wed, 9 Jun 2021 22:54:45 -0700
Subject: [PATCH 1/5] Update spin-wait pause/yield normalization

- Modified the measurement to take much less time and to remeasure
  periodically, to reduce CPU usage during startup
- Each measurement times pause/yield over a duration on the order of a few
  microseconds
- A small history of recent measurements is retained; for now, the lowest
  recent measurement is used for normalization
- Measurements are done lazily, and at most every few seconds another
  measurement is taken
- Added a profiling event that includes info about a measurement and the
  established value from recent measurements that is used for normalization
---
 .../src/System/Threading/Thread.CoreCLR.cs   |  23 +-
 src/coreclr/inc/yieldprocessornormalized.h   | 119 ++++++--
 .../utilcode/yieldprocessornormalized.cpp    |  16 +-
 src/coreclr/vm/CMakeLists.txt                |   2 +-
 src/coreclr/vm/ClrEtwAll.man                 |  27 +-
 src/coreclr/vm/ClrEtwAllMeta.lst             |   8 +-
 src/coreclr/vm/comsynchronizable.cpp         |  17 +-
 src/coreclr/vm/comsynchronizable.h           |   2 +-
 src/coreclr/vm/ecalllist.h                   |   2 +-
 src/coreclr/vm/eventtrace.cpp                |   6 +
 src/coreclr/vm/finalizerthread.cpp           |   5 -
 src/coreclr/vm/threads.cpp                   |   9 +-
 src/coreclr/vm/yieldprocessornormalized.cpp  | 285 ++++++++++++++----
 .../System/Threading/LowLevelSpinWaiter.cs   |   4 -
 .../src/System/Threading/SpinWait.cs         |   4 -
 15 files changed, 391 insertions(+), 138 deletions(-)

diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index c1a968ed9f9948..83be00cf5e03f6 100644
--- a/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -322,33 +322,14 @@ public void DisableComObjectEagerCleanup()
         [MethodImpl(MethodImplOptions.InternalCall)]
         public extern bool Join(int millisecondsTimeout);
 
-        private static int s_optimalMaxSpinWaitsPerSpinIteration;
-
-        [DllImport(RuntimeHelpers.QCall)]
-        private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal();
-
         /// <summary>
         /// Max value to be passed into <see cref="SpinWait(int)"/> for optimal delaying. This value is normalized to be
        /// appropriate for the processor.
         /// </summary>
         internal static int OptimalMaxSpinWaitsPerSpinIteration
         {
-            get
-            {
-                int optimalMaxSpinWaitsPerSpinIteration = s_optimalMaxSpinWaitsPerSpinIteration;
-                return optimalMaxSpinWaitsPerSpinIteration != 0 ? optimalMaxSpinWaitsPerSpinIteration : CalculateOptimalMaxSpinWaitsPerSpinIteration();
-            }
-        }
-
-        [MethodImpl(MethodImplOptions.NoInlining)]
-        private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
-        {
-            // This is done lazily because the first call to the function below in the process triggers a measurement that
-            // takes a nontrivial amount of time if the measurement has not already been done in the background.
-            // See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
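Under the old scheme in the removed code above, the first read of the property could trigger an expensive measurement. With this patch, consumers instead construct a YieldProcessorNormalizationInfo, whose constructor schedules a remeasurement when the current values are stale. A minimal consumer sketch using the two calls declared in this patch's yieldprocessornormalized.h; the spin loop itself is hypothetical, not code from the patch:

```cpp
#include "yieldprocessornormalized.h" // the header changed by this patch

// Hypothetical VM-side spin loop illustrating the new consumer pattern
bool TrySpinForCondition(bool (*condition)())
{
    // Snapshots the current normalization values and, as a side effect,
    // calls YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
    YieldProcessorNormalizationInfo normalizationInfo;
    for (unsigned int spinIteration = 0; spinIteration < 16; ++spinIteration)
    {
        if (condition())
        {
            return true;
        }
        // Exponential backoff, capped by the normalized per-iteration maximum
        YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration);
    }
    return false;
}
```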
-            s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
-            Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
-            return s_optimalMaxSpinWaitsPerSpinIteration;
+            [MethodImpl(MethodImplOptions.InternalCall)]
+            get;
         }
 
         [MethodImpl(MethodImplOptions.InternalCall)]
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index ba349bb83ad567..ca9e7b2ae3e982 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -12,14 +12,58 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
 #endif
 #define YieldProcessor Dont_Use_YieldProcessor
 
-const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
-const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+#define DISABLE_COPY(T) \
+    T(const T &) = delete; \
+    T &operator =(const T &) = delete
 
-extern unsigned int g_yieldsPerNormalizedYield;
-extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
+#define DISABLE_CONSTRUCT_COPY(T) \
+    T() = delete; \
+    DISABLE_COPY(T)
 
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
+class YieldProcessorNormalization
+{
+public:
+    static const unsigned int TargetNsPerNormalizedYield = 37;
+    static const unsigned int TargetMaxNsPerSpinIteration = 272;
+
+    // These are maximums for the values computed for normalization, based on how they are calculated
+    static const unsigned int MaxYieldsPerNormalizedYield = TargetNsPerNormalizedYield * 10;
+    static const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
+        TargetMaxNsPerSpinIteration * 3 / (TargetNsPerNormalizedYield * 2) + 1;
+
+private:
+    static bool s_isMeasurementScheduled;
+
+    static unsigned int s_yieldsPerNormalizedYield;
+    static unsigned int s_optimalMaxNormalizedYieldsPerSpinIteration;
+
+public:
+    static bool IsMeasurementScheduled()
+    {
+        return s_isMeasurementScheduled;
+    }
+
+    static void PerformMeasurement();
+
+private:
+    static void ScheduleMeasurementIfNecessary();
+
+public:
+    static unsigned int GetOptimalMaxNormalizedYieldsPerSpinIteration()
+    {
+        return s_optimalMaxNormalizedYieldsPerSpinIteration;
+    }
+
+    static void FireMeasurementEvents();
+
+private:
+    static double UntornLoad(double *valueRef);
+
+    DISABLE_CONSTRUCT_COPY(YieldProcessorNormalization);
+
+    friend class YieldProcessorNormalizationInfo;
+    friend void YieldProcessorNormalizedForPreSkylakeCount(unsigned int);
+};
 
 class YieldProcessorNormalizationInfo
 {
@@ -30,12 +74,15 @@ class YieldProcessorNormalizationInfo
 
 public:
     YieldProcessorNormalizationInfo()
-        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
-        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+        : yieldsPerNormalizedYield(YieldProcessorNormalization::s_yieldsPerNormalizedYield),
+        optimalMaxNormalizedYieldsPerSpinIteration(YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration),
         optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
     {
+        YieldProcessorNormalization::ScheduleMeasurementIfNecessary();
     }
 
+    DISABLE_COPY(YieldProcessorNormalizationInfo);
+
     friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
     friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
     friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
@@ -98,9 +145,8 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
 
     if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (count > MaxCount)
         {
             count = MaxCount;
@@ -144,9 +190,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 
     if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
             preSkylakeCount = MaxCount;
@@ -175,7 +220,35 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 // }
 FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
 {
-    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
+    // This function does not forward to the one above because it is used by some code under utilcode, where
+    // YieldProcessorNormalizationInfo cannot be used since normalization does not happen in some of its consumers. So this
+    // version uses the fields in YieldProcessorNormalization directly.
+
+    _ASSERTE(preSkylakeCount != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
+        if (preSkylakeCount > MaxCount)
+        {
+            preSkylakeCount = MaxCount;
+        }
+    }
+
+    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
+    SIZE_T n =
+        (SIZE_T)preSkylakeCount *
+        YieldProcessorNormalization::s_yieldsPerNormalizedYield /
+        PreSkylakeCountToSkylakeCountDivisor;
+    if (n == 0)
+    {
+        n = 1;
+    }
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
 }
 
 // See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
@@ -193,15 +266,12 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
     const YieldProcessorNormalizationInfo &normalizationInfo,
     unsigned int spinIteration)
 {
-    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
-    // InitializeYieldProcessorNormalized()
-    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
-        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
-    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    // This shift value should be adjusted based on the asserted condition below
+    // This shift value should be adjusted based on the asserted conditions below
     const UINT8 MaxShift = 3;
-    static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    static_assert_no_msg(
+        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    static_assert_no_msg(
+        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
 
     unsigned int n;
     if (spinIteration <= MaxShift &&
@@ -219,3 +289,6 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
         System_YieldProcessor();
     } while (--n != 0);
 }
+
+#undef DISABLE_CONSTRUCT_COPY
+#undef DISABLE_COPY
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index 4242f82792b47d..020d8d7cc79e4e 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -2,8 +2,16 @@
 // The .NET Foundation licenses this file to you under the MIT license.
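To make the static assertions above concrete, here is a standalone check (not part of the patch) that re-derives the limit from the patch's constants:

```cpp
#include <cstdio>

int main()
{
    // Constants from YieldProcessorNormalization in this patch
    const unsigned int TargetNsPerNormalizedYield = 37;
    const unsigned int TargetMaxNsPerSpinIteration = 272;

    // Same integer arithmetic as the header: 272 * 3 / (37 * 2) + 1 == 816 / 74 + 1 == 12
    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
        TargetMaxNsPerSpinIteration * 3 / (TargetNsPerNormalizedYield * 2) + 1;

    // MaxShift == 3 is the unique shift satisfying both asserted conditions:
    // (1u << 3) == 8 <= 12, and (1u << 4) == 16 > 12
    printf("MaxOptimalMaxNormalizedYieldsPerSpinIteration = %u\n",
        MaxOptimalMaxNormalizedYieldsPerSpinIteration);
    return 0;
}
```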
#include "stdafx.h" +#include "yieldprocessornormalized.h" -// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are -// tuned for Skylake processors -unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake -unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7; +bool YieldProcessorNormalization::s_isMeasurementScheduled; + +// Defaults are for when normalization has not yet been done +unsigned int YieldProcessorNormalization::s_yieldsPerNormalizedYield = 1; +unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration = + (unsigned int) + ( + (double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration / + YieldProcessorNormalization::TargetNsPerNormalizedYield + + 0.5 + ); diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt index 1d682d2a428bbf..f16005660dc345 100644 --- a/src/coreclr/vm/CMakeLists.txt +++ b/src/coreclr/vm/CMakeLists.txt @@ -136,7 +136,6 @@ set(VM_SOURCES_DAC_AND_WKS_COMMON versionresilienthashcode.cpp virtualcallstub.cpp win32threadpool.cpp - yieldprocessornormalized.cpp zapsig.cpp ) @@ -389,6 +388,7 @@ set(VM_SOURCES_WKS threadsuspend.cpp typeparse.cpp weakreferencenative.cpp + yieldprocessornormalized.cpp ${VM_SOURCES_GDBJIT} ) diff --git a/src/coreclr/vm/ClrEtwAll.man b/src/coreclr/vm/ClrEtwAll.man index d8a275c6da6295..0ffca6cb3cb6ba 100644 --- a/src/coreclr/vm/ClrEtwAll.man +++ b/src/coreclr/vm/ClrEtwAll.man @@ -438,7 +438,13 @@ - + + + + + @@ -2916,6 +2922,19 @@ + + @@ -3313,6 +3332,10 @@ keywords ="ThreadingKeyword" opcode="Wait" task="ThreadPoolWorkerThread" symbol="ThreadPoolWorkerThreadWait" message="$(string.RuntimePublisher.ThreadPoolWorkerThreadEventMessage)"/> + + + @@ -7410,6 +7434,7 @@ + diff --git a/src/coreclr/vm/ClrEtwAllMeta.lst b/src/coreclr/vm/ClrEtwAllMeta.lst index 285e9101c6321d..d565e015a8b050 100644 --- a/src/coreclr/vm/ClrEtwAllMeta.lst +++ b/src/coreclr/vm/ClrEtwAllMeta.lst @@ -134,9 +134,9 @@ nomac:GarbageCollection:::GCJoin_V2 nostack:Type:::BulkType -################### -# Threadpool events -################### +################################# +# Threading and Threadpool events +################################# nomac:WorkerThreadCreation:::WorkerThreadCreate noclrinstanceid:WorkerThreadCreation:::WorkerThreadCreate nomac:WorkerThreadCreation:::WorkerThreadTerminate @@ -170,6 +170,8 @@ nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment +nomac:YieldProcessorMeasurement:::YieldProcessorMeasurement +nostack:YieldProcessorMeasurement:::YieldProcessorMeasurement ################## # Exception events diff --git a/src/coreclr/vm/comsynchronizable.cpp b/src/coreclr/vm/comsynchronizable.cpp index 39f00d06741933..15a33c711e7a95 100644 --- a/src/coreclr/vm/comsynchronizable.cpp +++ b/src/coreclr/vm/comsynchronizable.cpp @@ -1089,22 +1089,13 @@ FCIMPL1(void, ThreadNative::SetIsThreadpoolThread, ThreadBaseObject* thread) } FCIMPLEND -INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration() +FCIMPL0(INT32, ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration) { - QCALL_CONTRACT; - - INT32 optimalMaxNormalizedYieldsPerSpinIteration; - - BEGIN_QCALL; - - // 
RuntimeThread calls this function only once lazily and caches the result, so ensure initialization - EnsureYieldProcessorNormalizedInitialized(); - optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration; - - END_QCALL; + FCALL_CONTRACT; - return optimalMaxNormalizedYieldsPerSpinIteration; + return (INT32)YieldProcessorNormalization::GetOptimalMaxNormalizedYieldsPerSpinIteration(); } +FCIMPLEND FCIMPL1(void, ThreadNative::SpinWait, int iterations) { diff --git a/src/coreclr/vm/comsynchronizable.h b/src/coreclr/vm/comsynchronizable.h index e9968201b8bc20..cfab18d9010706 100644 --- a/src/coreclr/vm/comsynchronizable.h +++ b/src/coreclr/vm/comsynchronizable.h @@ -86,7 +86,7 @@ friend class ThreadBaseObject; UINT64 QCALLTYPE GetProcessDefaultStackSize(); static FCDECL1(INT32, GetManagedThreadId, ThreadBaseObject* th); - static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration(); + static FCDECL0(INT32, GetOptimalMaxSpinWaitsPerSpinIteration); static FCDECL1(void, SpinWait, int iterations); static BOOL QCALLTYPE YieldThread(); static FCDECL0(Object*, GetCurrentThread); diff --git a/src/coreclr/vm/ecalllist.h b/src/coreclr/vm/ecalllist.h index f77dc75c80b5cd..ea3f65d72917d2 100644 --- a/src/coreclr/vm/ecalllist.h +++ b/src/coreclr/vm/ecalllist.h @@ -602,7 +602,7 @@ FCFuncStart(gThreadFuncs) #endif // FEATURE_COMINTEROP FCFuncElement("Interrupt", ThreadNative::Interrupt) FCFuncElement("Join", ThreadNative::Join) - QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration) + FCFuncElement("get_OptimalMaxSpinWaitsPerSpinIteration", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration) FCFuncElement("GetCurrentProcessorNumber", ThreadNative::GetCurrentProcessorNumber) FCFuncEnd() diff --git a/src/coreclr/vm/eventtrace.cpp b/src/coreclr/vm/eventtrace.cpp index 14bd22538d2b58..05f5f7509c9791 100644 --- a/src/coreclr/vm/eventtrace.cpp +++ b/src/coreclr/vm/eventtrace.cpp @@ -4680,6 +4680,12 @@ extern "C" { ETW::EnumerationLog::EnumerateForCaptureState(); } + + if (g_fEEStarted && !g_fEEShutDown) + { + // Emit the YieldProcessor measured values at the beginning of the trace + YieldProcessorNormalization::FireMeasurementEvents(); + } } #ifdef FEATURE_COMINTEROP if (ETW_EVENT_ENABLED(MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Context, CCWRefCountChange)) diff --git a/src/coreclr/vm/finalizerthread.cpp b/src/coreclr/vm/finalizerthread.cpp index 1e4dbf913c898e..e8370315e66651 100644 --- a/src/coreclr/vm/finalizerthread.cpp +++ b/src/coreclr/vm/finalizerthread.cpp @@ -379,11 +379,6 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args) { GetFinalizerThread()->SetBackground(TRUE); - { - GCX_PREEMP(); - EnsureYieldProcessorNormalizedInitialized(); - } - while (!fQuitFinalizer) { // This will apply any policy for swallowing exceptions during normal diff --git a/src/coreclr/vm/threads.cpp b/src/coreclr/vm/threads.cpp index fa93110399d397..01ae12d14d3b8c 100644 --- a/src/coreclr/vm/threads.cpp +++ b/src/coreclr/vm/threads.cpp @@ -1131,8 +1131,6 @@ void InitThreadManager() } CONTRACTL_END; - InitializeYieldProcessorNormalizedCrst(); - // All patched helpers should fit into one page. // If you hit this assert on retail build, there is most likely problem with BBT script. 
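A note on the comsynchronizable.cpp change above: the getter moved from a QCall to an FCall because it no longer has real work to do. The distinction, summarized as comments (general CLR convention, not text from this patch):

```cpp
// QCall: a regular native function entered through a managed-to-native transition;
// BEGIN_QCALL/END_QCALL set up a frame and switch to preemptive mode, so the body
// may block, take locks, and run something like the old measurement initialization.
//
// FCall: invoked directly from managed code with no transition frame, so the body
// must stay simple (FCALL_CONTRACT, no GC triggering) -- which the getter now can,
// since it only reads a static that the finalizer thread keeps up to date.
```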
     _ASSERTE_ALL_BUILDS("clr/src/VM/threads.cpp", (BYTE*)JIT_PatchedCodeLast - (BYTE*)JIT_PatchedCodeStart > (ptrdiff_t)0);
 
@@ -7145,6 +7143,7 @@ BOOL Thread::HaveExtraWorkForFinalizer()
         || Thread::CleanupNeededForFinalizedThread()
         || (m_DetachCount > 0)
         || SystemDomain::System()->RequireAppDomainCleanup()
+        || YieldProcessorNormalization::IsMeasurementScheduled()
         || ThreadStore::s_pThreadStore->ShouldTriggerGCForDeadThreads();
 }
 
@@ -7191,6 +7190,12 @@ void Thread::DoExtraWorkForFinalizer()
         // If there were any TimerInfos waiting to be released, they'll get flushed now
         ThreadpoolMgr::FlushQueueOfTimerInfos();
 
+    if (YieldProcessorNormalization::IsMeasurementScheduled())
+    {
+        GCX_PREEMP();
+        YieldProcessorNormalization::PerformMeasurement();
+    }
+
     ThreadStore::s_pThreadStore->TriggerGCForDeadThreadsIfNecessary();
 }
 
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 91547923310fbd..061d254a429f30 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -2,17 +2,33 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 #include "common.h"
+#include "yieldprocessornormalized.h"
 
-static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
-static CrstStatic s_initializeYieldProcessorNormalizedCrst;
+#ifndef CROSSGEN_COMPILE
 
-void InitializeYieldProcessorNormalizedCrst()
+#include "finalizerthread.h"
+
+enum class NormalizationState : UINT8
 {
-    WRAPPER_NO_CONTRACT;
-    s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock);
-}
+    Uninitialized,
+    Initialized,
+    Failed
+};
+
+static const int NsPerYieldMeasurementCount = 8;
+static const unsigned int MeasurementPeriodMs = 4000;
+
+static const unsigned int NsPerS = 1000 * 1000 * 1000;
+
+static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
+static unsigned int s_previousNormalizationTimeMs;
+
+static UINT64 s_performanceCounterTicksPerS;
+static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
+static int s_nextMeasurementIndex;
+static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
 
-static void InitializeYieldProcessorNormalized()
+static unsigned int DetermineMeasureDurationUs()
 {
     CONTRACTL
     {
@@ -22,92 +38,251 @@ static void InitializeYieldProcessorNormalized()
     }
     CONTRACTL_END;
 
-    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
 
-    if (s_isYieldProcessorNormalizedInitialized)
+    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
+    // if the overhead seems high relative to the measure duration.
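Concretely, the check that follows times two back-to-back QueryPerformanceCounter() calls and quadruples the measure duration when that overhead reaches a quarter of it. A standalone worked example, assuming the common 10 MHz counter frequency (one tick = 100 ns); the real frequency comes from QueryPerformanceFrequency():

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Assumed QPC frequency; the patch reads the real value into s_performanceCounterTicksPerS
    const uint64_t ticksPerS = 10000000;
    const uint64_t NsPerS = 1000000000;
    const unsigned int measureDurationUs = 1;

    // Same expression as the check below: 1/4 of the measure duration, expressed in ticks
    uint64_t thresholdTicks = ticksPerS * measureDurationUs * (1000 / 4) / NsPerS;
    printf("threshold = %llu ticks (= %llu ns of QPC overhead)\n",
        (unsigned long long)thresholdTicks,
        (unsigned long long)(thresholdTicks * NsPerS / ticksPerS));
    // => 2 ticks (200 ns): if two adjacent QPC calls are that far apart, the 1 us
    //    measure duration is quadrupled to 4 us so timing overhead does not dominate
    return 0;
}
```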
+    unsigned int measureDurationUs = 1;
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    UINT64 startTicks = li.QuadPart;
+    QueryPerformanceCounter(&li);
+    UINT64 elapsedTicks = li.QuadPart - startTicks;
+    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
     {
-        return;
+        measureDurationUs *= 4;
     }
+    return measureDurationUs;
+}
 
-    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
-    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
-    const int MeasureDurationMs = 10;
-    const int NsPerSecond = 1000 * 1000 * 1000;
+static double MeasureNsPerYield(unsigned int measureDurationUs)
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
+    UINT64 ticksPerS = s_performanceCounterTicksPerS;
+    UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
 
     LARGE_INTEGER li;
-    if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
+    QueryPerformanceCounter(&li);
+    UINT64 startTicks = li.QuadPart;
+
+    for (int i = 0; i < yieldCount; ++i)
     {
-        // High precision clock not available or clock resolution is too low, resort to defaults
-        s_isYieldProcessorNormalizedInitialized = true;
-        return;
+        System_YieldProcessor();
     }
-    ULONGLONG ticksPerSecond = li.QuadPart;
 
-    // Measure the nanosecond delay per yield
-    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
-    unsigned int yieldCount = 0;
     QueryPerformanceCounter(&li);
-    ULONGLONG startTicks = li.QuadPart;
-    ULONGLONG elapsedTicks;
-    do
-    {
-        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
-        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
-        // low microsecond range.
-        for (int i = 0; i < 1000; ++i)
+    UINT64 elapsedTicks = li.QuadPart - startTicks;
+    while (elapsedTicks < measureDurationTicks)
+    {
+        int nextYieldCount =
+            Max(4,
+                elapsedTicks == 0
+                    ? yieldCount / 4
+                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
+        for (int i = 0; i < nextYieldCount; ++i)
         {
             System_YieldProcessor();
         }
-        yieldCount += 1000;
 
         QueryPerformanceCounter(&li);
-        ULONGLONG nowTicks = li.QuadPart;
-        elapsedTicks = nowTicks - startTicks;
-    } while (elapsedTicks < measureDurationTicks);
-    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
-    if (nsPerYield < 1)
+        elapsedTicks = li.QuadPart - startTicks;
+        yieldCount += nextYieldCount;
+    }
+
+    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
+    const double MinNsPerYield = 0.1;
+
+    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
+    // really take this long. Limit the maximum to keep the recorded values reasonable.
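To see how the measurement loop above sizes itself, here is a standalone simulation (not code from the patch): it uses the patch's first-pass and proportional top-up formulas, with an assumed true cost of 5 ns/yield and with elapsed time computed rather than queried; the real code's elapsedTicks == 0 branch is omitted:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
    const double establishedNsPerYield = 37.0; // default until a measurement lands
    const double actualNsPerYield = 5.0;       // assumed true cost on this machine
    const double measureDurationNs = 1000.0;   // 1 us measure duration

    // First pass: same formula as the patch => 1000 / 37 + 1 == 28 yields
    int yieldCount = (int)(measureDurationNs / establishedNsPerYield) + 1;
    double elapsedNs = yieldCount * actualNsPerYield; // ~140 ns, not enough yet
    while (elapsedNs < measureDurationNs)
    {
        // Proportional top-up, as in the patch (ticks replaced by ns for clarity)
        int nextYieldCount = std::max(4,
            (int)(yieldCount * (measureDurationNs - elapsedNs) / elapsedNs) + 1);
        yieldCount += nextYieldCount;
        elapsedNs = yieldCount * actualNsPerYield;
    }
    // => measured 201 yields over 1005 ns => 5.00 ns/yield
    printf("measured %d yields over %.0f ns => %.2f ns/yield\n",
        yieldCount, elapsedNs, elapsedNs / yieldCount);
    return 0;
}
```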
+    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
+
+    return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
+}
+
+void YieldProcessorNormalization::PerformMeasurement()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_isMeasurementScheduled);
+
+    double latestNsPerYield;
+    if (s_normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return;
+        }
+
+        int nextMeasurementIndex = s_nextMeasurementIndex;
+        s_nsPerYieldMeasurements[nextMeasurementIndex] = latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
+        {
+            nextMeasurementIndex = 0;
+        }
+        s_nextMeasurementIndex = nextMeasurementIndex;
+    }
+    else if (s_normalizationState == NormalizationState::Uninitialized)
+    {
+        LARGE_INTEGER li;
+        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
+        {
+            // High precision clock not available or clock resolution is too low, resort to defaults
+            s_normalizationState = NormalizationState::Failed;
+            return;
+        }
+        s_performanceCounterTicksPerS = li.QuadPart;
+
+        unsigned int measureDurationUs = DetermineMeasureDurationUs();
+        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+        {
+            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
+            s_nsPerYieldMeasurements[i] = latestNsPerYield;
+            if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
+            {
+                s_establishedNsPerYield = latestNsPerYield;
+            }
+
+            if (i < NsPerYieldMeasurementCount - 1)
+            {
+                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+            }
+        }
+    }
+    else
     {
-        nsPerYield = 1;
+        _ASSERTE(s_normalizationState == NormalizationState::Failed);
+        return;
     }
 
-    // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
-    // value is naturally limited to MinNsPerNormalizedYield.
-    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
-    if (yieldsPerNormalizedYield < 1)
+    double establishedNsPerYield = s_nsPerYieldMeasurements[0];
+    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
     {
-        yieldsPerNormalizedYield = 1;
+        double nsPerYield = s_nsPerYieldMeasurements[i];
+        if (nsPerYield < establishedNsPerYield)
+        {
+            establishedNsPerYield = nsPerYield;
+        }
     }
-    _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
+    s_establishedNsPerYield = establishedNsPerYield;
+
+    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+
+    // Calculate the number of yields required to span the duration of a normalized yield
+    int yieldsPerNormalizedYield = Max(1, (int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
+    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
 
     // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
     // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
     // better job of allowing other work to run.
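Plugging representative measurements into the two normalization formulas (the yields-per-normalized-yield computation above, and the per-spin-iteration cap just below); the 5 ns and 140 ns figures are assumed pre-Skylake-like and post-Skylake-like outcomes, and std::max stands in for the CLR's Max helper:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
    const double TargetNsPerNormalizedYield = 37.0;  // from YieldProcessorNormalization
    const double TargetMaxNsPerSpinIteration = 272.0;

    const double samples[] = { 5.0, 140.0 }; // assumed measured ns/yield values
    for (double establishedNsPerYield : samples)
    {
        int yieldsPerNormalizedYield =
            std::max(1, (int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
        int optimalMaxNormalizedYieldsPerSpinIteration =
            std::max(1, (int)(TargetMaxNsPerSpinIteration /
                (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
        // => 5.0 ns/yield: 7 yields per normalized yield, cap 8 per spin iteration
        // => 140.0 ns/yield: 1 yield per normalized yield, cap 2 per spin iteration
        printf("%6.1f ns/yield -> %d yields per normalized yield, cap %d\n",
            establishedNsPerYield, yieldsPerNormalizedYield,
            optimalMaxNormalizedYieldsPerSpinIteration);
    }
    return 0;
}
```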
-    int optimalMaxNormalizedYieldsPerSpinIteration =
-        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
-    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
+    s_optimalMaxNormalizedYieldsPerSpinIteration =
+        Max(1, (int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+
+    s_previousNormalizationTimeMs = GetTickCount();
+    s_normalizationState = NormalizationState::Initialized;
+    s_isMeasurementScheduled = false;
+}
+
+#endif // !CROSSGEN_COMPILE
+
+void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+#ifndef CROSSGEN_COMPILE
+    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
+    if (normalizationState == NormalizationState::Initialized)
     {
-        optimalMaxNormalizedYieldsPerSpinIteration = 1;
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return;
+        }
+    }
+    else if (normalizationState == NormalizationState::Uninitialized)
+    {
+    }
+    else
+    {
+        _ASSERTE(normalizationState == NormalizationState::Failed);
+        return;
     }
 
-    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
-    s_isYieldProcessorNormalizedInitialized = true;
+    // The g_fEEStarted check is required for FinalizerThread::EnableFinalization() below
+    if (s_isMeasurementScheduled || !g_fEEStarted)
+    {
+        return;
+    }
 
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+    s_isMeasurementScheduled = true;
+    FinalizerThread::EnableFinalization();
+#endif // !CROSSGEN_COMPILE
 }
 
-void EnsureYieldProcessorNormalizedInitialized()
+#ifndef CROSSGEN_COMPILE
+
+void YieldProcessorNormalization::FireMeasurementEvents()
 {
     CONTRACTL
     {
         NOTHROW;
         GC_NOTRIGGER;
-        MODE_PREEMPTIVE;
+        MODE_ANY;
     }
     CONTRACTL_END;
 
-    if (!s_isYieldProcessorNormalizedInitialized)
+    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
+    // recorded information, so try to enumerate the array with some care.
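Stepping back, ScheduleMeasurementIfNecessary() above is the producer side of a handshake with the finalizer-thread hooks added in threads.cpp earlier in this patch. The flow, sketched as comments (all names are from the patch; timing details elided):

```cpp
// spinning thread                           finalizer thread
// ---------------                           ----------------
// YieldProcessorNormalizationInfo ctor
//   -> ScheduleMeasurementIfNecessary()
//        s_isMeasurementScheduled = true
//        FinalizerThread::EnableFinalization()
//                                            HaveExtraWorkForFinalizer()
//                                              -> IsMeasurementScheduled() == true
//                                            DoExtraWorkForFinalizer()
//                                              -> GCX_PREEMP();
//                                                 PerformMeasurement()
//                                                   (remeasures, updates the statics,
//                                                    clears s_isMeasurementScheduled)
```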
+    double establishedNsPerYield = UntornLoad(&s_establishedNsPerYield);
+    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
+    for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
     {
-        InitializeYieldProcessorNormalized();
+        double nsPerYield = UntornLoad(&s_nsPerYieldMeasurements[nextIndex]);
+        if (nsPerYield != 0) // the array may not be fully initialized yet
+        {
+            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
+        }
+
+        if (++nextIndex >= NsPerYieldMeasurementCount)
+        {
+            nextIndex = 0;
+        }
     }
 }
+
+double YieldProcessorNormalization::UntornLoad(double *valueRef)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    return VolatileLoadWithoutBarrier(valueRef);
+#else
+    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
+#endif
+}
+
+#endif // !CROSSGEN_COMPILE
diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
index e1c0766b3f0dfa..8e8198de392b05 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
@@ -71,10 +71,6 @@ public static void Wait(int spinIndex, int sleep0Threshold, int processorCount)
             // the equivalent of YieldProcessor(), as that that point SwitchToThread/Sleep(0) are more likely to be able to
             // allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
             // usually better for that.
-            //
-            // Thread.OptimalMaxSpinWaitsPerSpinIteration:
-            // - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
-            //
             int n = Thread.OptimalMaxSpinWaitsPerSpinIteration;
             if (spinIndex <= 30 && (1 << spinIndex) < n)
             {
diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
index b45cc7d5d3803a..66b73f8be02522 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
@@ -225,10 +225,6 @@ private void SpinOnceCore(int sleep1Threshold)
             // the equivalent of YieldProcessor(), as at that point SwitchToThread/Sleep(0) are more likely to be able to
             // allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
             // usually better for that.
-            //
-            // Thread.OptimalMaxSpinWaitsPerSpinIteration:
-            // - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
-            //
             int n = Thread.OptimalMaxSpinWaitsPerSpinIteration;
             if (_count <= 30 && (1 << _count) < n)
             {

From 84fa9a29e2eb5f770d15610e4a3d30cd4256617b Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Wed, 7 Jul 2021 22:02:04 -0700
Subject: [PATCH 2/5] Fix the other half of the potential for torn memory accesses
---
 src/coreclr/inc/yieldprocessornormalized.h  |  3 ++-
 src/coreclr/vm/yieldprocessornormalized.cpp | 29 ++++++++++++++++-----
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index ca9e7b2ae3e982..121e60b033356d 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -57,7 +57,8 @@ class YieldProcessorNormalization
     static void FireMeasurementEvents();
 
 private:
-    static double UntornLoad(double *valueRef);
+    static double AtomicLoad(double *valueRef);
+    static void AtomicStore(double *valueRef, double value);
 
     DISABLE_CONSTRUCT_COPY(YieldProcessorNormalization);
 
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 061d254a429f30..06214d85dc4414 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -130,7 +130,8 @@ void YieldProcessorNormalization::PerformMeasurement()
         }
 
         int nextMeasurementIndex = s_nextMeasurementIndex;
-        s_nsPerYieldMeasurements[nextMeasurementIndex] = latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
         if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
         {
             nextMeasurementIndex = 0;
@@ -152,10 +153,10 @@
         for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
         {
             latestNsPerYield = MeasureNsPerYield(measureDurationUs);
-            s_nsPerYieldMeasurements[i] = latestNsPerYield;
+            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
             if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
             {
-                s_establishedNsPerYield = latestNsPerYield;
+                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
             }
 
             if (i < NsPerYieldMeasurementCount - 1)
             {
@@ -179,7 +180,10 @@
             establishedNsPerYield = nsPerYield;
         }
     }
-    s_establishedNsPerYield = establishedNsPerYield;
+    if (establishedNsPerYield != s_establishedNsPerYield)
+    {
+        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
+    }
 
     FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
 
@@ -257,11 +261,11 @@
 
     // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
     // recorded information, so try to enumerate the array with some care.
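Patch 2's rename from UntornLoad to AtomicLoad (below) comes with a matching AtomicStore. The 32-bit trick deserves a note: a double is 8 bytes, and a 32-bit core may write it as two 4-byte halves, so a concurrent reader can observe a torn value. A standalone sketch of the load side, where InterlockedCompareExchange64 stands in for the patch's InterlockedCompareExchangeT:

```cpp
#include <cstring>
#include <windows.h>

static double AtomicLoadSketch(double *valueRef)
{
    // Exchanging 0 for 0 only writes when the current value is already 0, so the
    // compare-exchange is effectively a no-op -- but it returns the current 8 bytes
    // as one atomic access, which a plain 32-bit double read would not guarantee
    LONGLONG bits = InterlockedCompareExchange64((volatile LONGLONG *)valueRef, 0, 0);
    double value;
    memcpy(&value, &bits, sizeof(value));
    return value;
}
```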
-    double establishedNsPerYield = UntornLoad(&s_establishedNsPerYield);
+    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
     int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
     for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
     {
-        double nsPerYield = UntornLoad(&s_nsPerYieldMeasurements[nextIndex]);
+        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
         if (nsPerYield != 0) // the array may not be fully initialized yet
         {
             FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
@@ -274,7 +278,7 @@
     }
 }
 
-double YieldProcessorNormalization::UntornLoad(double *valueRef)
+double YieldProcessorNormalization::AtomicLoad(double *valueRef)
 {
     WRAPPER_NO_CONTRACT;
 
@@ -285,4 +289,15 @@
 #endif
 }
 
+void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    *valueRef = value;
+#else
+    InterlockedExchangeT(valueRef, value);
+#endif
+}
+
 #endif // !CROSSGEN_COMPILE

From 6efe43fc271a0a165508bdbfdc1645ce4dfeed64 Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Thu, 8 Jul 2021 12:37:13 -0700
Subject: [PATCH 3/5] Address feedback
---
 src/coreclr/vm/eventtrace.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/coreclr/vm/eventtrace.cpp b/src/coreclr/vm/eventtrace.cpp
index 05f5f7509c9791..321b3608b0b874 100644
--- a/src/coreclr/vm/eventtrace.cpp
+++ b/src/coreclr/vm/eventtrace.cpp
@@ -4417,6 +4417,12 @@ VOID EtwCallbackCommon(
     {
         ETW::TypeSystemLog::OnKeywordsChanged();
     }
+
+    if (g_fEEStarted && !g_fEEShutDown)
+    {
+        // Emit the YieldProcessor measured values at the beginning of the trace
+        YieldProcessorNormalization::FireMeasurementEvents();
+    }
 }
 
 // Individual callbacks for each EventPipe provider.
@@ -4680,12 +4686,6 @@ extern "C"
         {
             ETW::EnumerationLog::EnumerateForCaptureState();
         }
-
-        if (g_fEEStarted && !g_fEEShutDown)
-        {
-            // Emit the YieldProcessor measured values at the beginning of the trace
-            YieldProcessorNormalization::FireMeasurementEvents();
-        }
     }
 #ifdef FEATURE_COMINTEROP
     if (ETW_EVENT_ENABLED(MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Context, CCWRefCountChange))

From c81dd11e95dc247e40b7e09a186134a8d6df99ea Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Fri, 9 Jul 2021 11:36:38 -0700
Subject: [PATCH 4/5] Add check to see if event is enabled
---
 src/coreclr/vm/yieldprocessornormalized.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 06214d85dc4414..540aa57ec58f0e 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -259,6 +259,11 @@ void YieldProcessorNormalization::FireMeasurementEvents()
     }
     CONTRACTL_END;
 
+    if (!EventEnabledYieldProcessorMeasurement())
+    {
+        return;
+    }
+
     // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
     // recorded information, so try to enumerate the array with some care.
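For context on the surrounding lines: enumeration starts at s_nextMeasurementIndex, the slot that will be overwritten next, i.e. the oldest entry, so events fire oldest-first. A standalone illustration with a hypothetical 4-slot ring (the patch uses NsPerYieldMeasurementCount == 8, and the values here are made up):

```cpp
#include <cstdio>

int main()
{
    // Chronologically, 36.2 is the oldest value and 40.1 the newest; nextIndex == 1
    // marks the slot that would be overwritten next, i.e. the oldest entry
    const int Count = 4;
    double measurements[Count] = { 40.1, 36.2, 37.0, 38.5 };
    int nextIndex = 1;

    for (int i = 0; i < Count; ++i)
    {
        double nsPerYield = measurements[nextIndex];
        if (nsPerYield != 0) // zero means the slot has not been written yet
        {
            printf("%.1f\n", nsPerYield); // stands in for FireEtwYieldProcessorMeasurement
        }
        if (++nextIndex >= Count)
        {
            nextIndex = 0;
        }
    }
    return 0; // prints 36.2, 37.0, 38.5, 40.1 -- oldest to newest
}
```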
     double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
     int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
     for (int i = 0; i < NsPerYieldMeasurementCount; ++i)

From 2d1455cd086b343431e03beffe40212c5c99f2f1 Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Fri, 9 Jul 2021 18:54:36 -0700
Subject: [PATCH 5/5] Fix signed/unsigned mismatch
---
 src/coreclr/vm/yieldprocessornormalized.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 540aa57ec58f0e..2c51e73b678d8e 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -188,7 +188,7 @@ void YieldProcessorNormalization::PerformMeasurement()
     FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
 
     // Calculate the number of yields required to span the duration of a normalized yield
-    int yieldsPerNormalizedYield = Max(1, (int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+    unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
     _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
     s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
 
@@ -196,7 +196,7 @@ void YieldProcessorNormalization::PerformMeasurement()
     // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
     // better job of allowing other work to run.
     s_optimalMaxNormalizedYieldsPerSpinIteration =
-        Max(1, (int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+        Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
     _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
 
     GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
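The mismatch patch 5 fixes: the computed values were int while the destination fields and limit constants are unsigned int, so the stores and _ASSERTE comparisons mixed signedness. Keeping the whole computation unsigned, as above, resolves the compiler warning. A minimal before/after sketch, with std::max standing in for the CLR's Max helper:

```cpp
#include <algorithm>

static const unsigned int TargetNsPerNormalizedYield = 37; // from the patch
static unsigned int s_yieldsPerNormalizedYield;

void Before(double establishedNsPerYield)
{
    // int result stored into an unsigned field; subsequent comparisons against
    // unsigned limits warn about signed/unsigned mismatch
    int yields = std::max(1, (int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
    s_yieldsPerNormalizedYield = yields;
}

void After(double establishedNsPerYield)
{
    // Patch 5: both Max() arguments and the cast are unsigned, so the arithmetic
    // stays in one type end to end
    unsigned int yields =
        std::max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
    s_yieldsPerNormalizedYield = yields;
}
```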