create external resource pool "lcerp1" with (affinity numanode = (0));
create external resource pool "lcerp2" with (affinity numanode = (1));
create resource pool "lcrp1" with (affinity numanode = (0));
create resource pool "lcrp2" with (affinity numanode = (1));
create workload group "rg0" using "lcrp1", external "lcerp1";
create workload group "rg1" using "lcrp2", external "lcerp2";
USE [master]
GO
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE function [dbo].[assign_external_resource_pool]()
returns sysname
with schemabinding
as
begin
return concat('rg', @@SPID%2);
end;
GO
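Creating the classifier function by itself has no effect; it still has to be registered with Resource Governor and the configuration reconfigured. That step is not shown above, but it amounts to the following standard DDL:

-- Register the classifier so that new sessions are routed to rg0/rg1,
-- then apply the pending pool, group, and classifier changes
ALTER RESOURCE GOVERNOR WITH (CLASSIFIER_FUNCTION = [dbo].[assign_external_resource_pool]);
ALTER RESOURCE GOVERNOR RECONFIGURE;
GO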
CREATE TABLE [dbo].[models](
[model] [varbinary](max) NOT NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY];
GO
INSERT INTO [dbo].[models]
EXEC sp_execute_external_script
@language = N'R',
@script = N'
randomForestObj <- rxDForest(is_bad ~ revol_util + int_rate + mths_since_last_record + annual_inc_joint + dti_joint + total_rec_prncp + all_util, InputDataSet)
model <- data.frame(payload = as.raw(serialize(randomForestObj, connection=NULL)))
',
@input_data_1 = N'SELECT revol_util, int_rate, mths_since_last_record, annual_inc_joint, dti_joint, total_rec_prncp, all_util, is_bad FROM [dbo].[LoanStats] WHERE (ABS(CAST((BINARY_CHECKSUM(id, NEWID())) as int)) % 100) < 75',
@output_data_1_name = N'model';
Once you have Resource Governor configured, you can create a PowerShell script that spawns parallel threads to call the loan scoring stored procedure over ranges of an increment you specify. The 6 - Score Loans.ps1 and 7 - WhatIf.ps1 PowerShell scripts available in the repository on GitHub spawn parallel threads in a while loop to execute the loan scoring stored procedure. The stored procedure fetches data for the range provided by the PowerShell script using the non-clustered columnstore index, and then uses sp_execute_external_script to score the loans with the model created earlier. The scoring results are stored in an in-memory, schema-only table to minimize the transaction logging overhead of multiple parallel threads writing into the same database at a very high rate. Since the loan scoring rate is quite high, you can afford to store the results in an in-memory table, provided you have sufficient RAM available.
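The table definitions are not reproduced in this post; below is a minimal sketch of what the memory-optimized, schema-only results and timing tables could look like. The column names (in particular the predicted-probability column emitted by rxPredict) are assumptions, and the database needs a MEMORY_OPTIMIZED_DATA filegroup for these statements to succeed.

-- Illustrative sketch only: column names and types are assumptions
-- DURABILITY = SCHEMA_ONLY keeps inserts out of the transaction log
CREATE TABLE [dbo].[LoanStatsPredictions](
[id] bigint NOT NULL,
[is_bad_Pred] float NULL,
INDEX ix_predictions_id NONCLUSTERED ([id])
) WITH (MEMORY_OPTIMIZED = ON, DURABILITY = SCHEMA_ONLY);
GO
CREATE TABLE [dbo].[RunTimeStats](
[SPID] int NOT NULL,
[Runtime] datetime NOT NULL,
[Status] varchar(10) NOT NULL,
INDEX ix_runtimestats_runtime NONCLUSTERED ([Runtime])
) WITH (MEMORY_OPTIMIZED = ON, DURABILITY = SCHEMA_ONLY);
GO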
The ScoreLoans stored procedure and the PowerShell script that calls it are shown below.
-- Stored procedure for scoring loans for the base predictions
CREATE PROCEDURE [dbo].[ScoreLoans]
@start bigint,
@end bigint
AS
BEGIN
-- Declare the variables to get the input data and the scoring model
DECLARE @inquery nvarchar(max) = N'SELECT id, revol_util, int_rate, mths_since_last_record, annual_inc_joint, dti_joint, total_rec_prncp, all_util, is_bad FROM [dbo].[LoanStats] WHERE [id] >= ' + CAST(@start as varchar(255)) + N' and [id] <= ' + CAST(@end as varchar(255));
DECLARE @model varbinary(max) = (SELECT TOP 1 [model] FROM [dbo].[models])
-- Log beginning of processing time
INSERT INTO [dbo].[RunTimeStats] VALUES (@@SPID, GETDATE(),'Start')
-- Score the loans and store them in a table
INSERT INTO [dbo].[LoanStatsPredictions]
EXEC sp_execute_external_script
@language = N'R',
@script = N'
rfModel <- unserialize(as.raw(model));
OutputDataSet<-rxPredict(rfModel, data = InputDataSet, extraVarsToWrite = c("id"))
',
@input_data_1 = @inquery,
@params = N'@model varbinary(max)',
@model = @model
-- Log end of processing time
INSERT INTO [dbo].[RunTimeStats] VALUES (@@SPID, GETDATE(),'End')
END
GO
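Before kicking off the parallel run, you can sanity-check the procedure with a single range; the boundary values here are just an example:

-- Score one batch and inspect the captured timings (range values are illustrative)
EXEC [dbo].[ScoreLoans] @start = 1, @end = 250000;
SELECT * FROM [dbo].[RunTimeStats] ORDER BY [Runtime];
GO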
# Start SQL jobs in a while loop to execute the scoring procedure in parallel
$StartCtr = 1
$Increment = 250000
$EndCtr = $Increment
$FinalCount = 1195907
$vServerName = $env:computername
$vDatabaseName = "LendingClub"
$count = "{0:N0}" -f $FinalCount
Write-Host "Performing clean-up to start new scoring run...." -ForegroundColor Yellow
# Start Cleanup
Invoke-Sqlcmd -ServerInstance $vServerName -Database $vDatabaseName -Query "delete from [LoanStatsPredictions];delete from Runtimestats;checkpoint;"
Write-Host "Starting parallel jobs to score " $count "loans" -ForegroundColor Yellow
while ($StartCtr -le $FinalCount)
{
    # Clamp the last batch so the tail of the table is not skipped
    if ($EndCtr -gt $FinalCount) { $EndCtr = $FinalCount }
    $SqlScript = [ScriptBlock]::Create("Invoke-Sqlcmd -ServerInstance `"" + $vServerName + "`" -Query `"EXEC [dbo].[ScoreLoans] " + $StartCtr + "," + $EndCtr + "`" -Database `"$vDatabaseName`"")
    Start-Job -ScriptBlock $SqlScript
    $StartCtr += $Increment
    $EndCtr += $Increment
}
# Wait till jobs complete
while (Get-Job -State Running)
{
Start-Sleep 1
}
# Find out duration
$duration = Invoke-Sqlcmd -ServerInstance $vServerName -Database $vDatabaseName -Query "select DATEDIFF(s,MIN (Runtime), MAX(Runtime)) as RuntimeSeconds from dbo.RuntimeStats;"
Write-Host "`n"
$rate = "{0:N2}" -f ($FinalCount/$duration.RuntimeSeconds)
Write-Host "Completed scoring" $count "loans in" $duration.RuntimeSeconds "seconds at" $rate "loans/sec." -ForegroundColor Green
# Remove Jobs
Get-Job | Remove-Job
The WhatIf scenario is a very common one for business users: model a change and check what the outcome would be. In this sample, the user can increase the interest rate of all the loans and check how the charge-off probability changes. Such WhatIf scenarios can be extended to handle complex business questions, giving business users the ability to run various models using the power of SQL Server and R Services and make informed decisions about their business. Implementations like these can turn the data in your data warehouse into a gold mine of business insights waiting to be harnessed!
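To sketch how such a WhatIf run can be wired up: the input query handed to sp_execute_external_script can simply rescale the rate column before scoring, while the stored model and the rest of the pipeline stay unchanged. The @rateMultiplier parameter below is a hypothetical illustration, not the exact logic in 7 - WhatIf.ps1:

-- Hypothetical WhatIf input: bump every interest rate by 10% and rescore
-- with the same stored model; only the input query changes
DECLARE @rateMultiplier float = 1.10;
DECLARE @whatIfQuery nvarchar(max) = N'SELECT id, revol_util, int_rate * '
    + CAST(@rateMultiplier as varchar(10))
    + N' AS int_rate, mths_since_last_record, annual_inc_joint, dti_joint, total_rec_prncp, all_util, is_bad FROM [dbo].[LoanStats]';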
The above sample is one way of setting up a parallel workload for scoring ingested loans from a table, using columnstore indexes to speed up data fetch/aggregation and parallel processing to achieve high throughput. On a machine with 32 logical processors across two NUMA nodes, I was able to get a throughput of ~298K loans/sec with only 9 parallel processes.