Forum Discussion
prathyoo
Jan 29, 2024Copper Contributor
Python - how do I bulk update file properties
Hi,
I have a requirement to bulk update 1.2 million files. I have the properties that need to be update in csv files and I have loaded those to pandas dataframe. I am currently able to find the correct file in the SharePoint document library and update the same one at a time. But this is extremely time consuming and I am only able to update 200 records per minute.
I am unable to figure out how to load the file items in bulk (into a list or dict) and to call a bulk update of all the files within one case (the 1.2 million files are distributed over 20,000 cases). I have used
execute_query_with_incremental_retry() to avoid throttling issue but still the performance of update is pretty bad. the code snippet. Some of the code is to allow me to restart from a point in case the update fails.
alreadyupdatedfiles =[]
for i in range(0,len(df),10):
dfMasterFileFolderDocumentList_cin = dfMasterFileFolderDocumentList[(dfMasterFileFolderDocumentList['Case Number New']==df.loc[i, 'Case Number New'])]
dfMasterFileFolderDocumentList_cin = dfMasterFileFolderDocumentList_cin.reset_index(drop= True)
SharePointCaseListName = dfCaseLocation.loc[dfCaseLocation['Case Number New']==df.loc[i, 'Case Number New']]['SharePointListName'].item()
SharePointDocumentListName = SharePointCaseListName.replace("Shared Cases","Shared Cases Documents")
SharePointListInternalName = dfCaseLocation.loc[dfCaseLocation['Case Number New']==df.loc[i, 'Case Number New']]['SharePointListInternalName'].item()
SharePointListInternalName = SharePointListInternalName.replace("Shared_Cases","SharedCasesDoc")
if (df.loc[i, 'Case Completion Status']) == 'Completed':
root = r'Lists/{0}/Completed'.format(SharePointListInternalName)
else:
root = r'Lists/{0}'.format(SharePointListInternalName)
ctxCaseItem = ClientContext(site_url, context_auth)
caseURL = dfCaseURL.loc[dfCaseURL['Title']==df.loc[i, 'Case Number New']]['Case Url'].item()
caseFileName = dfCaseURL.loc[dfCaseURL['Title']==df.loc[i, 'Case Number New']]['Case File Name'].item()
for j in range(len(dfMasterFileFolderDocumentList_cin)):
ctxFiles = ClientContext(site_url, context_auth)
file_folder = base64.b64decode(str(dfMasterFileFolderDocumentList_cin.iloc[j,2])).decode('UTF-16', "ignore")
file_folder = str(file_folder).replace('\\ ','\\').replace('/','_').replace('#','_').replace(':','_').replace('*','_').replace('*','_').replace('?','_').replace('|','_').replace('%','_').replace('>','_').replace('<','_').replace('"','_')
if file_folder[len(file_folder)-1] =='.':
file_folder = file_folder[:-1]
fileNameActual = base64.b64decode(str(dfMasterFileFolderDocumentList_cin.loc[j,'FullFileName'])).decode('UTF-16', "ignore")
fileNameActual = str(fileNameActual).rstrip().replace('/','_').replace('#','_').replace(':','_').replace('*','_').replace('*','_').replace('?','_').replace('|','_').replace('%','_').replace('>','_').replace('<','_').replace('\\','_').replace('"','_').replace('\t','_')
fullpath = r"{0}/{1}/{2}".format(root,file_folder,fileNameActual)
if fullpath not in alreadyupdatedfiles:
print(r"{0}/{1}/{2}".format(root,file_folder,fileNameActual))
target_file = ctxFiles.web.get_file_by_server_relative_url(r"{0}/{1}/{2}".format(root,file_folder,fileNameActual)).listItemAllFields
ctxFiles.load(target_file)
ctxFiles.execute_query()
DocumentTitle = base64.b64decode(str(dfMasterFileFolderDocumentList_cin.loc[j,'doc_title'])).decode('UTF-16', "ignore")
ctxFileItem = ClientContext(site_url, context_auth)
target_file_item = ctxFileItem.web.lists.get_by_title(SharePointDocumentListName).get_item_by_id(target_file.id)
target_file_item.set_property("Title",DocumentTitle)
target_file_item.set_property("CaseNumber",dfMasterFileFolderDocumentList_cin.loc[j,'Case Number New'])
target_file_item.set_property("DocumentId",dfMasterFileFolderDocumentList_cin.loc[j,'Sequence Number'])
if str(dfMasterFileFolderDocumentList_cin.loc[j,'Document_Type SPID']) != 'nan':
target_file_item.set_property("DocumentTypeLKId",str(dfMasterFileFolderDocumentList_cin.loc[j,'Document_Type SPID']))
target_file_item.set_property("CaseUrl",caseURL)
if str(dfMasterFileFolderDocumentList_cin.loc[j,'DownloadList']) != 'nan':
target_file_item.set_property("DownloadLists",{ 'results': str(dfMasterFileFolderDocumentList_cin.loc[j,'DownloadList']).split(',') })
target_file_item.update()
ctxFileItem.execute_query_with_incremental_retry()
alreadyupdatedfiles.append(r"{0}/{1}/{2}".format(root,file_folder,fileNameActual))
No RepliesBe the first to reply