When is Spinlock a Significant Driver of CPU utilization in SQL Server?
Published May 03 2019 03:02 PM 7,494 Views
Microsoft
When is Spinlock a Significant Driver of CPU utilization?

SQL Server uses multiple mechanisms for synchronizing access to shared structures. One such mechanism is a spinlock. Briefly, a spinlock is a type of lock acquisition mechanism which is designed to avoid context switching . In essence, instead of going to sleep (Sleep()) or waiting to be signaled (WaitForSingleObject()), a thread performs "idle" loops on the CPU and periodically checks if the lock becomes available so it can acquire it. Think of it as "Spin until I can get the lock". The goal is to avoid thread Context switching which is an expensiveness operation from an OS perspective. Spinlocks use an optimistic approach assuming that it won't take "too many" loops or spins until the lock is available. Spinlocks can become a problem if this assumption is not true, that is, an excessive number of spins are performed leading to high CPU usage.
For more details about Spinlocks, their implementation and t-shooting, please review the following white paper: Diagnosing and Resolving Spinlock Contention on SQL Server. Here is a short description from the paper:
 
"Spinlocks are lightweight synchronization primitives which are used to protect access to data structures. Spinlocks are not unique to SQL Server. They are generally used when it is expected that access to a given data structure will need to be held for a very short period of time. When a thread attempting to acquire a spinlock is unable to obtain access it executes in a loop periodically checking to determine if the resource is available instead of immediately yielding. After some period of time a thread waiting on a spinlock will yield before it is able to acquire the resource in order to allow other threads running on the same CPU to execute. This is known as a backoff and will be discussed in more depth later in this paper."
 
The question this post will attempt to address is "How many spins is too many?" or "When do we consider spinlocks to be a significant driver of CPU usage"
 
The key to answering this is examining "Spins per second per CPU". If this number exceeds approximately 20000 spins per millisecond on a single CPU, then the spinlock has reached a level where it impacts CPU usage significantly. How do I arrive at this number?
 
Spinlocks are a little more than regular for/while loops, so if you measure approximately how many tight loops a CPU can perform in a unit of time - say millisecond - then you would know what is a reasonable threshold to compare against.
The following sample C++ application accomplishes just that on a single CPU (it introduces some pause similar to spinlock code) (get entire solution at GitHub )
 
#include 
#include 
#include 
#include    
#include 

//make this global for easy access
LONG volatile lockValue = 1;
const UINT64 MIN_SPIN = 250;
const UINT64 MAX_SPIN = 1000000;
UINT64 globalSpins = 0;


BOOL GetLock(
	__in LONG Id)			// I Thread id and additional information
{
	assert(Id != 0);

	UINT64 oldId = InterlockedCompareExchangeAcquire(&lockValue, Id, 0);

	return oldId == 0;
}

void ReleaseLock( void *)
{
	//sleep an arbitrarily hard-coded time
	Sleep(5000);

	//set the value to 0 so the lock is released and spinlock acquires the lock
	lockValue = 0; 

}

void
SpinToAcquireLockWithExponentialBackoff(__in DWORD Id, __out UINT32 * BackoffCount)			
{

	UINT32	Backoffs = 0;

	UINT64 spinLimit = MIN_SPIN;

	UINT64 iSpin = 0;

	while (true)
	{
		assert(spinLimit <= MAX_SPIN && spinLimit >= MIN_SPIN);

		// Spin for a while without reading the lock.
		//
		for (iSpin = 0; iSpin < spinLimit; iSpin++)
		{
	
			_mm_pause(); 
		}

		//printf_s("iSpin = %ld\n", iSpin);

		//increment the global spins
		globalSpins = globalSpins + iSpin;

		//printf_s("globalSpins = %ld\n", globalSpins);


		// Try to get the lock.
		if (GetLock(Id))
		{
			break;
		}

		//
		spinLimit = min(MAX_SPIN, spinLimit * 2);

		// See if we need to backoff
		//
		Backoffs++;

	}

	//output the final backoffs
	*BackoffCount = Backoffs; 

} // SpinToAcquireLockWithExponentialBackoff


void ExerciseSpinLockCode()
{
	UINT32 backoffCount = 0;
	LARGE_INTEGER beforeHighRes, afterHighRes, elapsedMicroseconds, frequency;

	//spawn a thread to simulate lock release
	HANDLE hThread = (HANDLE)_beginthread(ReleaseLock, 0, NULL);

	//get start times
	QueryPerformanceCounter(&beforeHighRes);
	long int before = GetTickCount();
	
	//invoke Spinlock code
	SpinToAcquireLockWithExponentialBackoff(GetCurrentThreadId(), &backoffCount);
	
	//get the end times
	long int after = GetTickCount();
	QueryPerformanceCounter(&afterHighRes);
	QueryPerformanceFrequency(&frequency);

	//calculate microseconds
	//ticks divided by ticks-per-second (frequency) , gives us seconds
	//converted to microseconds by multiplying to 1 mil
	
	elapsedMicroseconds.QuadPart = (afterHighRes.QuadPart - beforeHighRes.QuadPart);
	elapsedMicroseconds.QuadPart *= 1000000;  
	elapsedMicroseconds.QuadPart /= frequency.QuadPart;

	UINT64 SpinsPerMillisecond = (globalSpins) / (after - before);

	printf_s("SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = %ld, Spins=%I64d, Backoffs=%ld\n", after - before, globalSpins, backoffCount);
	printf_s("SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=%I64d\n", SpinsPerMillisecond);
	printf_s("SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=%I64d\n", globalSpins/(elapsedMicroseconds.QuadPart/1000));

	//wait on the thread completion
	WaitForSingleObject(hThread, INFINITE);
}

void ExerciseSimpleLoopCode()
{
	long i=0;
	LARGE_INTEGER beforeHighRes, afterHighRes, elapsedMicroseconds, frequency;
	
	QueryPerformanceCounter(&beforeHighRes);

	//loop, no pause
	for (i; i < MAX_SPIN*10; i++)
	{
		;
	}


	QueryPerformanceCounter(&afterHighRes);
	QueryPerformanceFrequency(&frequency);

	
	//https://docs.microsoft.com/en-us/windows/desktop/SysInfo/acquiring-high-resolution-time-stamps
	elapsedMicroseconds.QuadPart = (afterHighRes.QuadPart - beforeHighRes.QuadPart);
	elapsedMicroseconds.QuadPart *= 1000000;
	elapsedMicroseconds.QuadPart /= frequency.QuadPart;


	long elapsedTimeQPC = elapsedMicroseconds.QuadPart / 1000 == 0 ? 1 : (elapsedMicroseconds.QuadPart / 1000);

	printf_s("Simple Loop: Millisecond elapsed (QPC)=%lld, Loops=%d\n", (elapsedMicroseconds.QuadPart / 1000), i);
	printf_s("Simple Loop: Spins/Millisecond (QPC)=%ld\n", i / elapsedTimeQPC);

}

int main()
{
	ExerciseSpinLockCode();

	ExerciseSimpleLoopCode();

	printf_s("Press ENTER to end...");
	_gettchar();

	return 0;
}
 
 

Sample Results 1 (Intel i7 2.7 Ghz
=======================
SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 5031, Spins=102023750, Backoffs=112
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=20279
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=20315
Simple Loop: Millisecond elapsed (QPC)=4, Loops=10000000
Simple Loop: Spins/Millisecond (QPC)=2500000
Press ENTER to end...
 
Sample Results 2 (AMD A6-6310 APU 1.8 Ghz
===============================
SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 5016, Spins=187023750, Backoffs=197
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=37285
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=37292
Simple Loop: Millisecond elapsed (QPC)=11, Loops=10000000
Simple Loop: Spins/Millisecond (QPC)=909090
Press ENTER to end...
 
Sample Results 3 (Intel Q6600 2.4 Ghz)
===========================
SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 5015, Spins=916023750, Backoffs=926
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=182656
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=183058
Simple Loop: Millisecond elapsed (QPC)=4, Loops=10000000
Simple Loop: Spins/Millisecond (QPC)=2500000
Press ENTER to end...
 
Sample Results 4 (Intel Xeon E5-2620 v4 2.10 Ghz)
==================================
SpinToAcquireLockWithExponentialBackoff: Milliseconds elapsed = 5016, Spins=81123750, Backoffs=821
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond=161687
SpinToAcquireLockWithExponentialBackoff: Spins/Millisecond(QPC)=161977
Simple Loop: Millisecond elapsed (QPC)=4, Loops=10000000
Simple Loop: Spins/Millisecond (QPC)=2500000
Press ENTER to end...
 

Since the above example is designed to perform a little more than a loop with periodic pauses until the lock is acquired, it would drive CPU utilization to 100% on a single CPU.
Based on these values one can conclude that if spinlock statistics show values greater than about 20000  spins per millisecond per CPU, then the CPU usage is likely fairly high and close to 100% capacity.
 
Naturally, CPU clock speeds would impact the results and faster CPUs would be able to perform more spins per millisecond. Other CPU-related specifications like L1, L2, L3 caches may also play somewhat of a role here. However, the goal of this post is to provide guidance, a "ballpark" figure so to speak. Please feel free to report your test results in the Comments section.

For more information on how to collect Spinlock t-shooting information, please see above white paper - Appending B.
Alternatively, below is a spinlock data-collection and analysis script:
 
use tempdb
go
declare @cntr int = 0
if exists(select name from tempdb.sys.sysobjects where name like '#_spin_waits%')
drop table #_spin_waits


create table #_spin_waits
(
lock_name varchar(128)
,collisions bigint
,spins bigint
,sleep_time bigint
,backoffs bigint
,snap_time datetime
)

while (@cntr < 4)
begin
	--capture the current stats
	insert into #_spin_waits
	(
		lock_name
		,collisions
		,spins
		,sleep_time
		,backoffs
		,snap_time
	)
	select name
		,collisions
		,spins
		,sleep_time
		,backoffs
		,getdate()
	from sys.dm_os_spinlock_stats

	waitfor delay '00:00:10'
	set @cntr = @cntr + 1

end

--Analysis

declare @cpu int
select @cpu = cpu_count from sys.dm_os_sys_info

--SPINLOCKS compute delta 
select  t2.[lock_name] as spinlock_name,  
cast(cast(t2.spins as float) - cast(t1.spins as float) as bigint) delta_spins,  
cast (cast(t2.Backoffs as float) - cast (t1.Backoffs as float) as bigint) delta_backoff, 
DATEDIFF(MILLISECOND,t1.snap_time,t2.snap_time) delta_minuntes,
cast(cast(t2.spins as float) - cast(t1.spins as float) as bigint)/DATEDIFF(MILLISECOND,t1.snap_time,t2.snap_time)/@cpu delta_spins_per_millisecond_per_CPU

from 
(select row_number () over ( partition by [lock_name]  order by    snap_time) row, *  from [#_spin_waits] 
	where snap_time in 
	(select min(snap_time) from #_spin_waits) ) t1
join 
(select row_number () over ( partition by [lock_name]  order by    snap_time) row, *  from [#_spin_waits]  
	where snap_time in 
	(select max(snap_time) from #_spin_waits) ) t2
on t1.row = t2.row and t1.[lock_name]=t2.[lock_name]
where cast(cast(t2.spins as float) - cast(t1.spins as float) as bigint) > 0
order by delta_spins desc

 

Enjoy!

 

 
Co-Authors
Version history
Last update:
‎Jan 28 2023 12:19 PM
Updated by: