When is Spinlock a Significant Driver of CPU utilization?
SQL Server uses multiple mechanisms for synchronizing access to shared structures. One such mechanism is a spinlock. Briefly, a spinlock is a type of lock acquisition mechanism which is designed to avoid context switching . In essence, instead of going to sleep (Sleep()) or waiting to be signaled (WaitForSingleObject()), a thread performs "idle" loops on the CPU and periodically checks if the lock becomes available so it can acquire it. Think of it as "Spin until I can get the lock". The goal is to avoid thread Context switching which is an expensiveness operation from an OS perspective. Spinlocks use an optimistic approach assuming that it won't take "too many" loops or spins until the lock is available. Spinlocks can become a problem if this assumption is not true, that is, an excessive number of spins are performed leading to high CPU usage.
For more details about Spinlocks, their implementation and t-shooting, please review the following white paper: Diagnosing and Resolving Spinlock Contention on SQL Server. Here is a short description from the paper:
"Spinlocks are lightweight synchronization primitives which are used to protect access to data structures. Spinlocks are not unique to SQL Server. They are generally used when it is expected that access to a given data structure will need to be held for a very short period of time. When a thread attempting to acquire a spinlock is unable to obtain access it executes in a loop periodically checking to determine if the resource is available instead of immediately yielding. After some period of time a thread waiting on a spinlock will yield before it is able to acquire the resource in order to allow other threads running on the same CPU to execute. This is known as a backoff and will be discussed in more depth later in this paper."
The question this post will attempt to address is "How many spins is too many?" or "When do we consider spinlocks to be a significant driver of CPU usage"
The key to answering this is examining "Spins per second per CPU". If this number exceeds approximately 20000 spins per millisecond on a single CPU, then the spinlock has reached a level where it impacts CPU usage significantly. How do I arrive at this number?
Spinlocks are a little more than regular for/while loops, so if you measure approximately how many tight loops a CPU can perform in a unit of time - say millisecond - then you would know what is a reasonable threshold to compare against.
The following sample C++ application accomplishes just that on a single CPU (it introduces some pause similar to spinlock code) (get entire solution at GitHub )
The following sample C++ application accomplishes just that on a single CPU (it introduces some pause similar to spinlock code) (get entire solution at GitHub )
#include
#include
#include
#include
#include
//make this global for easy access
LONG volatile lockValue = 1;
const UINT64 MIN_SPIN = 250;
const UINT64 MAX_SPIN = 1000000;
UINT64 globalSpins = 0;
BOOL GetLock(
__in LONG Id) // I Thread id and additional information
{
assert(Id != 0);
UINT64 oldId = InterlockedCompareExchangeAcquire(&lockValue, Id, 0);
return oldId == 0;
}
void ReleaseLock( void *)
{
//sleep an arbitrarily hard-coded time
Sleep(5000);
//set the value to 0 so the lock is released and spinlock acquires the lock
lockValue = 0;
}
void
SpinToAcquireLockWithExponentialBackoff(__in DWORD Id, __out UINT32 * BackoffCount)
{
UINT32 Backoffs = 0;
UINT64 spinLimit = MIN_SPIN;
UINT64 iSpin = 0;
while (true)
{
assert(spinLimit <= MAX_SPIN && spinLimit >= MIN_SPIN);
// Spin for a while without reading the lock.
//
for (iSpin = 0; iSpin < spinLimit; iSpin++)
{
_mm_pause();
}
//printf_s("iSpin = %ld\n", iSpin);
//increment the global spins
globalSpins = globalSpins + iSpin;
//printf_s("globalSpins = %ld\n", globalSpins);
// Try to get the lock.
if (GetLock(Id))
{
break;
}
//
spinLimit = min(MAX_SPIN, spinLimit * 2);
// See if we need to backoff
//
Backoffs++;
}
//output the final backoffs
*BackoffCount = Backoffs;
} // SpinToAcquireLockWithExponentialBackoff
void ExerciseSpinLockCode()
{
UINT32 backoffCount = 0;
LARGE_INTEGER beforeHighRes, afterHighRes, elapsedMicroseconds, frequency;
//spawn a thread to simulate lock release
HANDLE hThread = (HANDLE)_beginthread(ReleaseLock, 0, NULL);
//get start times
QueryPerformanceCounter(&beforeHighRes);
long int before = GetTickCount();
//invoke Spinlock code
SpinToAcquireLockWithExponentialBackoff(GetCurrentThreadId(), &backoffCount);
//get the end times
long int after = GetTickCount();
QueryPerformanceCounter(&afterHighRes);
QueryPerformanceFrequency(&frequency);
//calculate microseconds
//ticks divided by ticks-per-second (frequency) , gives us seconds
//converted to microseconds by multiplying to 1 mil
elapsedMicroseconds.QuadPart = (afterHighRes.QuadPart - beforeHighRes.QuadPart);
elapsedMicroseconds.QuadPart *= 1000000;
elapsedMicroseconds.QuadPart /= frequency.QuadPart;
UINT64 SpinsPerMillisecond = (globalSpins) / (after - before);
printf_s("SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = %ld, Spins=%I64d, Backoffs=%ld\n", after - before, globalSpins, backoffCount);
printf_s("SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=%I64d\n", SpinsPerMillisecond);
printf_s("SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=%I64d\n", globalSpins/(elapsedMicroseconds.QuadPart/1000));
//wait on the thread completion
WaitForSingleObject(hThread, INFINITE);
}
void ExerciseSimpleLoopCode()
{
long i=0;
LARGE_INTEGER beforeHighRes, afterHighRes, elapsedMicroseconds, frequency;
QueryPerformanceCounter(&beforeHighRes);
//loop, no pause
for (i; i < MAX_SPIN*10; i++)
{
;
}
QueryPerformanceCounter(&afterHighRes);
QueryPerformanceFrequency(&frequency);
//https://docs.microsoft.com/en-us/windows/desktop/SysInfo/acquiring-high-resolution-time-stamps
elapsedMicroseconds.QuadPart = (afterHighRes.QuadPart - beforeHighRes.QuadPart);
elapsedMicroseconds.QuadPart *= 1000000;
elapsedMicroseconds.QuadPart /= frequency.QuadPart;
long elapsedTimeQPC = elapsedMicroseconds.QuadPart / 1000 == 0 ? 1 : (elapsedMicroseconds.QuadPart / 1000);
printf_s("Simple Loop: Millisecond elapsed (QPC)=%lld, Loops=%d\n", (elapsedMicroseconds.QuadPart / 1000), i);
printf_s("Simple Loop: Spins/Millisecond (QPC)=%ld\n", i / elapsedTimeQPC);
}
int main()
{
ExerciseSpinLockCode();
ExerciseSimpleLoopCode();
printf_s("Press ENTER to end...");
_gettchar();
return 0;
}
Test results (Single-Threaded Spinlock Simulator)
To invoke the application, go to a command prompt and chang directory to the folder where you have downloaded or compiled \Release folder. Then do this:
SpinlockSimulatorSingleThread.exe
Here are the sample results from multiple different machines with different CPU models/speeds
Sample Results 1 (Intel i7 2.7 Ghz)
=======================
SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 5031, Spins=102023750, Backoffs=112
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=20279
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=20315
Simple Loop: Millisecond elapsed (QPC)=4, Loops=10000000
Simple Loop: Spins/Millisecond (QPC)=2500000
Press ENTER to end...
Sample Results 2 (AMD A6-6310 APU 1.8 Ghz)
===============================
SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 5016, Spins=187023750, Backoffs=197
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=37285
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=37292
Simple Loop: Millisecond elapsed (QPC)=11, Loops=10000000
Simple Loop: Spins/Millisecond (QPC)=909090
Press ENTER to end...
Sample Results 3 (Intel Q6600 2.4 Ghz)
===========================
SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 5015, Spins=916023750, Backoffs=926
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=182656
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=183058
Simple Loop: Millisecond elapsed (QPC)=4, Loops=10000000
Simple Loop: Spins/Millisecond (QPC)=2500000
Press ENTER to end...
Sample Results 4 (Intel Xeon E5-2620 v4 2.10 Ghz)
==================================
SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 5016, Spins=81123750, Backoffs=821
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=161687
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=161977
Simple Loop: Millisecond elapsed (QPC)=4, Loops=10000000
Simple Loop: Spins/Millisecond (QPC)=2500000
Press ENTER to end...
Since the above example is designed to perform a little more than a loop with periodic pauses until the lock is acquired, it would drive CPU utilization to 100% on a single CPU.
Based on these values one can conclude that if spinlock statistics show values greater than about 20000 spins per millisecond per CPU, then the CPU usage is likely fairly high and close to 100% capacity.
Naturally, CPU clock speeds would impact the results and faster CPUs would be able to perform more spins per millisecond. Other CPU-related specifications like L1, L2, L3 caches may also play somewhat of a role here. However, the goal of this post is to provide guidance, a "ballpark" figure so to speak. Please feel free to report your test results in the Comments section.
Multi-Threaded SpinlockSimulator
There is a multi-threaded project in the same Github repository SpinlockSimulatorMultiThread. That is you can run it on multiple threads/CPUs in parallel and threads compete for a single spinlock. However, the key is to test CPU and its performance is to use SINGLE processor. Therefore, use for most scenarios you would benefit from using the SpinlockSimulatorSingleThread project from the repro. A single processor test is the best approach because the multi-threaded (multi-CPU) introduces the lock waits and thus spins will vary between threads due to how long they waited on a spinlock. But if you only look at the Simple Loop output, there the loops are consistently the same and thus the spins/millisecond would be the same on all threads. Or if you take into cosideration the spins, the calculations will make sense. In some scenarios, you may benefit from testing with the multi-threaded application. The way to use the multi-threaded app by is by calling the executable from command prompt. Without parameters it will default to 4 threads. If you pass a single-digit parameter, you can tell the app to run 1 or more threads:
SpinlockSimulatorMultiThread.exe 8
Sample output may look like this:
Launching 8 thread(s)...
Thread 28700 started.
Thread 28700 acquired the lock. Simulating work by sleeping 3 seconds...
Thread 6756 started.
Thread 22472 started.
Thread 30632 started.
Thread 22176 started.
Thread 6552 started.
Thread 32968 started.
Thread 15996 started.
Thread 28700: SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 3016, Spins=250, Backoffs=0
Thread 28700: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=0
Thread 28700: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=0
Thread 6552 acquired the lock. Simulating work by sleeping 3 seconds...
Thread 28700: Simple Loop - Millisecond elapsed (QPC)=8, Loops=10000000
Thread 28700: Simple Loop - Spins/Millisecond (QPC)=1250000
Thread 28700 completed. Local Spins: 250
Thread 6552: SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 7078, Spins=49023750, Backoffs=59
Thread 6552: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=6926
Thread 6552: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=6920
Thread 6756 acquired the lock. Simulating work by sleeping 3 seconds...
Thread 6552: Simple Loop - Millisecond elapsed (QPC)=3, Loops=10000000
Thread 6552: Simple Loop - Spins/Millisecond (QPC)=3333333
Thread 6552 completed. Local Spins: 49023750
Thread 6756: SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 10125, Spins=114023750, Backoffs=124
Thread 6756: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=11261
Thread 22472 acquired the lock. Simulating work by sleeping 3 seconds...
Thread 6756: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=11274
Thread 6756: Simple Loop - Millisecond elapsed (QPC)=3, Loops=10000000
Thread 6756: Simple Loop - Spins/Millisecond (QPC)=3333333
Thread 6756 completed. Local Spins: 114023750
Thread 22472: SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 13157, Spins=163023750, Backoffs=173
Thread 22472: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=12390
Thread 22472: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=12392
Thread 22472: Simple Loop - Millisecond elapsed (QPC)=4, Loops=10000000
Thread 22472: Simple Loop - Spins/Millisecond (QPC)=2500000
Thread 22472 completed. Local Spins: 163023750
Thread 32968 acquired the lock. Simulating work by sleeping 3 seconds...
Thread 32968: SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 16156, Spins=215023750, Backoffs=225
Thread 32968: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=13309
Thread 32968: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=13310
Thread 32968: Simple Loop - Millisecond elapsed (QPC)=4, Loops=10000000
Thread 32968: Simple Loop - Spins/Millisecond (QPC)=2500000
Thread 32968 completed. Local Spins: 215023750
Thread 22176 acquired the lock. Simulating work by sleeping 3 seconds...
Thread 22176: SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 19188, Spins=261023750, Backoffs=271
Thread 22176: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=13603
Thread 22176: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=13603
Thread 22176: Simple Loop - Millisecond elapsed (QPC)=3, Loops=10000000
Thread 22176: Simple Loop - Spins/Millisecond (QPC)=3333333
Thread 22176 completed. Local Spins: 261023750
Thread 15996 acquired the lock. Simulating work by sleeping 3 seconds...
Thread 15996: SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 22156, Spins=312023750, Backoffs=322
Thread 15996: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=14083
Thread 15996: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=14074
Thread 15996: Simple Loop - Millisecond elapsed (QPC)=3, Loops=10000000
Thread 15996: Simple Loop - Spins/Millisecond (QPC)=3333333
Thread 15996 completed. Local Spins: 312023750
Thread 30632 acquired the lock. Simulating work by sleeping 3 seconds...
Thread 30632: SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 25219, Spins=359023750, Backoffs=369
Thread 30632: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=14236
Thread 30632: SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=14237
Thread 30632: Simple Loop - Millisecond elapsed (QPC)=3, Loops=10000000
Thread 30632: Simple Loop - Spins/Millisecond (QPC)=3333333
Thread 30632 completed. Local Spins: 359023750
All threads completed. Total Spins: 1473166500
Press ENTER to end...
T-shooting Spinlocks in SQL Server
For more information on how to collect Spinlock t-shooting information, please see white paper - Diagnosing and Resolving Spinlock Contention on SQL Server. Appending B.
Alternatively, below is a spinlock data-collection and analysis script:
use tempdb
go
declare @cntr int = 0
if exists(select name from tempdb.sys.sysobjects where name like '#_spin_waits%')
drop table #_spin_waits
create table #_spin_waits
(
lock_name varchar(128)
,collisions bigint
,spins bigint
,sleep_time bigint
,backoffs bigint
,snap_time datetime
)
while (@cntr < 4)
begin
--capture the current stats
insert into #_spin_waits
(
lock_name
,collisions
,spins
,sleep_time
,backoffs
,snap_time
)
select name
,collisions
,spins
,sleep_time
,backoffs
,getdate()
from sys.dm_os_spinlock_stats
waitfor delay '00:00:10'
set @cntr = @cntr + 1
end
--Analysis
declare @cpu int
select @cpu = cpu_count from sys.dm_os_sys_info
--SPINLOCKS compute delta
select t2.[lock_name] as spinlock_name,
cast(cast(t2.spins as float) - cast(t1.spins as float) as bigint) delta_spins,
cast (cast(t2.Backoffs as float) - cast (t1.Backoffs as float) as bigint) delta_backoff,
DATEDIFF(MILLISECOND,t1.snap_time,t2.snap_time) delta_minuntes,
cast(cast(t2.spins as float) - cast(t1.spins as float) as bigint)/DATEDIFF(MILLISECOND,t1.snap_time,t2.snap_time)/@cpu delta_spins_per_millisecond_per_CPU
from
(select row_number () over ( partition by [lock_name] order by snap_time) row, * from [#_spin_waits]
where snap_time in
(select min(snap_time) from #_spin_waits) ) t1
join
(select row_number () over ( partition by [lock_name] order by snap_time) row, * from [#_spin_waits]
where snap_time in
(select max(snap_time) from #_spin_waits) ) t2
on t1.row = t2.row and t1.[lock_name]=t2.[lock_name]
where cast(cast(t2.spins as float) - cast(t1.spins as float) as bigint) > 0
order by delta_spins desc
Enjoy!
Updated Mar 31, 2025
Version 12.0Joseph_Pilov
Microsoft
Joined October 22, 2018
SQL Server Support Blog
Follow this blog board to get notified when there's new activity