Azure Database Support Blog

Lesson Learned #234: Parallel vs Single Running a Bulk Insert with Python

Jose_Manuel_Jurado
Aug 18, 2022

Today we worked on a service request in which our customer wanted to improve the performance of a bulk insert process. I would like to share my experience from that case.

 

Our customer mentioned that inserting 100,000 rows was taking 14 seconds on a Business Critical database. I was able to reproduce that time using a single thread against a table with 20 columns.

 

To improve this Python code, I suggested running the bulk insert in parallel with a batch size of 10,000 rows, and I also followed these best practices to reduce the execution time of the process:

 

    • Client virtual machine level:
      • Enable accelerated networking.
      • Size the CPU/vCores to the number of parallel processes needed; in this case, 10 vCores.
      • Place the virtual machine in the same region as the database.
    • Database level:
      • Create a table with 20 columns.
      • As the PK is a sequential key, include the parameter OPTIMIZE_FOR_SEQUENTIAL_KEY = ON in the clustered index definition.
      • Configure the same number of CPU/vCores as the maximum number of parallel processes you want to run; in this case, 10 vCores.
      • Depending on the amount of data, use Business Critical to reduce storage latency.
    • Python code level (see the minimal sketch after this list):
      • Use the executemany method to reduce network round trips, sending only the parameter values.
      • Run in batches (1,000 to 10,000 rows) instead of a single process.
      • Use SET NOCOUNT ON to suppress the rows-affected response for every insert.
      • In the connection, use autocommit=False.
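
Distilled into a minimal single-threaded sketch, those Python-level settings look like this. The connection values are placeholders, and test_data_small is a hypothetical three-column table used only for illustration, not the 20-column table created later:

import pyodbc

#Minimal sketch of the settings above. Connection values are placeholders
#and test_data_small is a hypothetical table. autocommit=False groups the
#whole batch into a single transaction committed once at the end.
cnxn = pyodbc.connect(
    "DRIVER={ODBC Driver 17 for SQL Server};"
    "SERVER=tcp:servername.database.windows.net,1433;"
    "DATABASE=databasename;UID=username;PWD=password",
    autocommit=False)
cursor = cnxn.cursor()
cursor.fast_executemany = True #Send the parameter array in bulk, fewer round trips.
rows = [(i, i, 'AAAAAA') for i in range(10000)] #Dummy batch of 10,000 rows.
cursor.executemany(
    "SET NOCOUNT ON;"
    "INSERT INTO test_data_small ([Key], Num_TEST, TEST_01) VALUES (?,?,?)",
    rows)
cnxn.commit()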

Below is an example of the full Python script. It reads a CSV file and, for every 10,000 rows, executes the bulk insert on a new thread.


import csv
import pyodbc
import threading
import os
import datetime 

class ThreadsOrder: #Class to run the insert batches in parallel.
    def ExecuteSQL(self,a,s,n): #Launch the module-level ExecuteSQL function on a new thread.
        TExecutor = threading.Thread(target=ExecuteSQL,args=(a,s,n,))
        TExecutor.start()

def SaveResults( Message, bSaveFile): #Print a message and, if bSaveFile is True, append it to the log file.
    try:
        print(Message)

        if (bSaveFile==True):
            file_object  = open(filelog, "a") 
            file_object.write(datetime.datetime.strftime(datetime.datetime.now(), '%d/%m/%y %H:%M:%S') + '-' + Message + '\n' )
            file_object.close()
    except BaseException as e:
        print('An error occurred - ', format(e))


def ExecuteSQLcc(sTableName): #Create (or recreate) the demo table and its clustered index.
    try:
        cnxn1 = pyodbc.connect("DRIVER={ODBC Driver 17 for SQL Server};APP=Bulk Insert Test;SERVER=" + SQL_server + ";DATABASE=" + SQL_database + ";UID=" + SQL_user + ';PWD=' + SQL_password, autocommit=False, timeout=3600)
        cursor = cnxn1.cursor()
        cursor.execute("DROP TABLE IF EXISTS " + sTableName)
        cursor.commit()
        cursor.execute("CREATE TABLE " + sTableName + " (" \
                            "	[Key] [int] NOT NULL," \
                            "	[Num_TEST] [int] NULL," \
                            "	[TEST_01] [varchar](6) NULL," \
                            "	[TEST_02] [varchar](6) NULL," \
                            "	[TEST_03] [varchar](6) NULL," \
                            "	[TEST_04] [varchar](6) NULL," \
                            "	[TEST_05] [varchar](6) NULL," \
                            "	[TEST_06] [varchar](6) NULL," \
                            "	[TEST_07] [varchar](6) NULL," \
                            "	[TEST_08] [varchar](6) NULL," \
                            "	[TEST_09] [varchar](6) NULL," \
                            "	[TEST_10] [varchar](6) NULL," \
                            "	[TEST_11] [varchar](6) NULL," \
                            "	[TEST_12] [varchar](6) NULL," \
                            "	[TEST_13] [varchar](6) NULL," \
                            "	[TEST_14] [varchar](6) NULL," \
                            "	[TEST_15] [varchar](6) NULL," \
                            "	[TEST_16] [varchar](6) NULL," \
                            "	[TEST_17] [varchar](6) NULL," \
                            "	[TEST_18] [varchar](6) NULL," \
                            "	[TEST_19] [varchar](6) NULL," \
                            "	[TEST_20] [varchar](6) NULL)")
        cursor.commit()
        cursor.execute("CREATE CLUSTERED INDEX [ix_ms_example] ON " + sTableName + " ([Key] ASC) WITH (STATISTICS_NORECOMPUTE = OFF, DROP_EXISTING = OFF, ONLINE = OFF, OPTIMIZE_FOR_SEQUENTIAL_KEY = ON) ON [PRIMARY]")
        cursor.commit()
    except BaseException as e:
            SaveResults('Executing SQL - an error occurred - ' + format(e),True)


def ExecuteSQL(a,sTableName,n): #Insert one batch of rows; n==-1 marks a single-threaded run.
    try:
        Before = datetime.datetime.now()   
        
        if n==-1:
            sTypeProcess = "NoAsync"
        else:
            sTypeProcess="Async - Thread:" + str(n) 
        SaveResults('Executing at ' + str(Before) + " Process Type: " + sTypeProcess, True )      
        cnxn1 = pyodbc.connect("DRIVER={ODBC Driver 17 for SQL Server};APP=Bulk Insert Test;SERVER=" + SQL_server + ";DATABASE=" + SQL_database + ";UID=" + SQL_user + ';PWD=' + SQL_password, autocommit=False, timeout=3600)

        cursor = cnxn1.cursor()
        cursor.fast_executemany = True
        cursor.executemany("SET NOCOUNT ON;INSERT INTO " + sTableName +" ([Key], Num_TEST, TEST_01, TEST_02, TEST_03, TEST_04, TEST_05, TEST_06, TEST_07, TEST_08, TEST_09, TEST_10, TEST_11, TEST_12, TEST_13, TEST_14, TEST_15, TEST_16, TEST_17, TEST_18, TEST_19, TEST_20) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",a)                          
        cursor.commit()
        SaveResults('Time Difference INSERT process ' + str(datetime.datetime.now() - Before) + " " + sTypeProcess, True )  

    except BaseException as e:
            SaveResults('Executing SQL - an error occurred - ' + format(e),True)


#Connectivity details.
SQL_server = 'tcp:servername.database.windows.net,1433'
SQL_database = 'databasename'
SQL_user = 'username'
SQL_password = 'password'


#file details to read 
filepath = 'c:\\k\\' #Folder containing the demo CSV file.
filelog = filepath + 'Error.log' #Save the log.

chunksize = 10000 #Transaction batch rows.
sTableName = "[test_data]" #Table Name (dummy)

pThreadOrder = ThreadsOrder()
nThread = 0 #Thread counter -- right now, the number of threads is unbounded.

ExecuteSQLcc(sTableName)

Before = datetime.datetime.now()  
line_count = 0         
for directory, subdirectories, files in os.walk(filepath):
    for file in files:
      name, ext = os.path.splitext(file)
      if ext == '.csv': 
            a=[]
            SaveResults('Reading the file ' + name ,True)
            BeforeFile= datetime.datetime.now()             
            with open(os.path.join(directory,file), mode='r') as csv_file:
              csv_reader = csv.reader(csv_file, delimiter=',')
              for row in csv_reader:
                    line_count += 1
                    if line_count > 1: #Skip the CSV header row.
                        a.append(row)

                    if (line_count%chunksize)==0:
                        deltaFile = datetime.datetime.now() - BeforeFile
                        nThread=nThread+1                        
                        SaveResults('Time Difference Reading file is ' + str(deltaFile) + ' for ' + str(line_count) + ' rows', True )                        
                        pThreadOrder.ExecuteSQL(a,sTableName,nThread) #Open a new thread per transaction batch.
                        #ExecuteSQL(a,sTableName,-1) #Uncomment to run single-threaded instead.
                        a=[]
            if len(a) > 0: #Flush the final partial batch that never reached chunksize.
                nThread = nThread + 1
                pThreadOrder.ExecuteSQL(a,sTableName,nThread)
            BeforeFile = datetime.datetime.now()

SaveResults('Total Time Difference Reading file is ' + str(datetime.datetime.now() - Before) + ' for ' + str(line_count) + ' rows for the file: ' + name , True )  
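
The script above opens one new thread per batch with no upper bound. If you prefer to cap the concurrency at the vCore count recommended earlier, a minimal sketch (not part of the original script) using concurrent.futures and reusing the ExecuteSQL function defined above could look like this:

from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 10 #One worker per vCore, matching the 10 vCores suggested above.

def run_batches(batches, sTableName):
    #Each element of batches is a list of rows. ExecuteSQL opens its own
    #connection, so the workers never share pyodbc objects across threads.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        futures = [pool.submit(ExecuteSQL, batch, sTableName, n)
                   for n, batch in enumerate(batches, start=1)]
        for f in futures:
            f.result() #Re-raise any exception thrown inside a worker.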


During the execution, if you need to see the connections, the number of rows inserted, and the impact in terms of resources, run the following T-SQL:


SELECT
  SUBSTRING(REPLACE(REPLACE(SUBSTRING(ST.text, (req.statement_start_offset/2) + 1,
    ((CASE req.statement_end_offset WHEN -1 THEN DATALENGTH(ST.text) ELSE req.statement_end_offset END
      - req.statement_start_offset)/2) + 1), CHAR(10), ' '), CHAR(13), ' '), 1, 512) AS statement_text
, req.database_id
, sess.program_name
, req.session_id
, req.cpu_time AS cpu_time_ms
, req.status
, req.wait_time
, req.wait_resource
, req.wait_type
, req.last_wait_type
, req.total_elapsed_time
, sess.total_scheduled_time
, req.row_count AS [Row Count]
, req.command
, req.scheduler_id
, sess.memory_usage
, req.writes
, req.reads
, req.logical_reads
, req.blocking_session_id
FROM sys.dm_exec_requests AS req
INNER JOIN sys.dm_exec_sessions AS sess ON sess.session_id = req.session_id
CROSS APPLY sys.dm_exec_sql_text(req.sql_handle) AS ST
WHERE req.session_id <> @@SPID

select count(*) from test_data

select * from sys.dm_db_resource_stats order by end_time desc


Enjoy!

 

Updated Aug 19, 2022