Python - how do I bulk update file properties

Copper Contributor

Hi,

 

I have a requirement to bulk update 1.2 million files. I have the properties that need to be update in csv files and I have loaded those to pandas dataframe. I am currently able to find the correct file in the SharePoint document library and update the same one at a time. But this is extremely time consuming and I am only able to update 200 records per minute. 

 

I am unable to figure out how to load the file items in bulk (into a list or dict) and to call a bulk update of all the files within one case (the 1.2 million files are distributed over 20,000 cases). I have used 

execute_query_with_incremental_retry() to avoid throttling issue but still the performance of update is pretty bad. the code snippet. Some of the code is to allow me to restart from a point in case the update fails.

 

alreadyupdatedfiles =[]
for i in range(0,len(df),10):
    dfMasterFileFolderDocumentList_cin = dfMasterFileFolderDocumentList[(dfMasterFileFolderDocumentList['Case Number New']==df.loc[i, 'Case Number New'])]
    dfMasterFileFolderDocumentList_cin = dfMasterFileFolderDocumentList_cin.reset_index(drop= True)
    SharePointCaseListName = dfCaseLocation.loc[dfCaseLocation['Case Number New']==df.loc[i, 'Case Number New']]['SharePointListName'].item()
    SharePointDocumentListName = SharePointCaseListName.replace("Shared Cases","Shared Cases Documents")
    SharePointListInternalName = dfCaseLocation.loc[dfCaseLocation['Case Number New']==df.loc[i, 'Case Number New']]['SharePointListInternalName'].item()
    SharePointListInternalName = SharePointListInternalName.replace("Shared_Cases","SharedCasesDoc")
    if (df.loc[i, 'Case Completion Status']) == 'Completed':
        root = r'Lists/{0}/Completed'.format(SharePointListInternalName)
    else:
        root = r'Lists/{0}'.format(SharePointListInternalName)
    ctxCaseItem = ClientContext(site_url, context_auth)
    caseURL = dfCaseURL.loc[dfCaseURL['Title']==df.loc[i, 'Case Number New']]['Case Url'].item()
    caseFileName = dfCaseURL.loc[dfCaseURL['Title']==df.loc[i, 'Case Number New']]['Case File Name'].item()  
    
    for j in range(len(dfMasterFileFolderDocumentList_cin)):
        ctxFiles = ClientContext(site_url, context_auth)
        file_folder = base64.b64decode(str(dfMasterFileFolderDocumentList_cin.iloc[j,2])).decode('UTF-16', "ignore")
        file_folder = str(file_folder).replace('\\ ','\\').replace('/','_').replace('#','_').replace(':','_').replace('*','_').replace('*','_').replace('?','_').replace('|','_').replace('%','_').replace('>','_').replace('<','_').replace('"','_')
        if file_folder[len(file_folder)-1] =='.':
            file_folder = file_folder[:-1]
        fileNameActual = base64.b64decode(str(dfMasterFileFolderDocumentList_cin.loc[j,'FullFileName'])).decode('UTF-16', "ignore")
        fileNameActual =  str(fileNameActual).rstrip().replace('/','_').replace('#','_').replace(':','_').replace('*','_').replace('*','_').replace('?','_').replace('|','_').replace('%','_').replace('>','_').replace('<','_').replace('\\','_').replace('"','_').replace('\t','_')
        fullpath = r"{0}/{1}/{2}".format(root,file_folder,fileNameActual)
       
        if fullpath not in alreadyupdatedfiles:
            print(r"{0}/{1}/{2}".format(root,file_folder,fileNameActual))
            target_file = ctxFiles.web.get_file_by_server_relative_url(r"{0}/{1}/{2}".format(root,file_folder,fileNameActual)).listItemAllFields
            ctxFiles.load(target_file)
            ctxFiles.execute_query()
            DocumentTitle = base64.b64decode(str(dfMasterFileFolderDocumentList_cin.loc[j,'doc_title'])).decode('UTF-16', "ignore")
            ctxFileItem = ClientContext(site_url, context_auth)
            target_file_item = ctxFileItem.web.lists.get_by_title(SharePointDocumentListName).get_item_by_id(target_file.id)
            target_file_item.set_property("Title",DocumentTitle)
            target_file_item.set_property("CaseNumber",dfMasterFileFolderDocumentList_cin.loc[j,'Case Number New'])
            target_file_item.set_property("DocumentId",dfMasterFileFolderDocumentList_cin.loc[j,'Sequence Number'])
            if str(dfMasterFileFolderDocumentList_cin.loc[j,'Document_Type SPID']) != 'nan':
                target_file_item.set_property("DocumentTypeLKId",str(dfMasterFileFolderDocumentList_cin.loc[j,'Document_Type SPID']))
            target_file_item.set_property("CaseUrl",caseURL)
            if str(dfMasterFileFolderDocumentList_cin.loc[j,'DownloadList']) != 'nan':
                target_file_item.set_property("DownloadLists",{ 'results': str(dfMasterFileFolderDocumentList_cin.loc[j,'DownloadList']).split(',') })
            target_file_item.update()
            ctxFileItem.execute_query_with_incremental_retry()
            alreadyupdatedfiles.append(r"{0}/{1}/{2}".format(root,file_folder,fileNameActual))
    

 

0 Replies