Hi,
thank you for this guide.
I don't see how running multiple instances of Script 2 in parallel would increase the speed by any significant amount. The bottleneck is listing and identifying the blobs that have index tags, not the removal of those tags. Unfortunately, I had several million blobs to process and the script did not finish even after weeks, so I improved it a bit:
- walk the blobs in hierarchical order, so it can be restarted with a prefix
- add logging to track the progress
- remove one call to the blob service to increase speed
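# Please update the parameters below with your own information before executing this script:
# account_name: Storage account name.
# account_key: Storage account key.
# prefix: Only blobs whose names start with this prefix are processed (use it to restart a run part-way through).
# container_name: Name of the container where the blobs with index tags are.
# concurrency: Number of worker threads used to remove index tags.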
import os
import threading
from concurrent.futures import ThreadPoolExecutor

from azure.storage.blob import BlobServiceClient, BlobPrefix, ContainerClient
# Storage account credentials
account_name = "XXX"
account_key = "XXX"
# Container to process, and the blob-name prefix the walk starts from
container_name = "XXX"
prefix = "XXX"
# Maximum number of worker threads in the pool
concurrency = 250
# Running total of blobs found carrying index tags
blob_count = 0
# Service client for the account's blob endpoint, authenticated with the account key
blob_service_client = BlobServiceClient(
    account_url=f"https://{account_name}.blob.core.windows.net",
    credential=account_key,
)
def remove_blob_index_tag(blob_name):
    """Remove the index tags from the blob named *blob_name*.

    The blob is looked up in the module-level ``container_name`` through the
    module-level ``blob_service_client``, and its tags are set to ``None``.
    """
    blob_service_client.get_blob_client(
        container=container_name,
        blob=blob_name,
    ).set_blob_tags(tags=None)
print(f"This script will remove index tags on blobs with prefix {prefix}", flush=True)

# Names of blobs whose tag removal failed; appended to from the done callbacks
failed_blobs = []
# Caps the number of submitted-but-unfinished removals so that listing
# millions of blobs cannot pile up an unbounded task queue in memory
in_flight = threading.BoundedSemaphore(concurrency * 4)


def _on_removal_done(future, blob_name):
    """Free one in-flight slot and record the blob if its removal failed."""
    in_flight.release()
    if future.cancelled():
        failed_blobs.append(blob_name)
        return
    error = future.exception()
    if error is not None:
        failed_blobs.append(blob_name)
        print(f"Failed to remove index tags on {blob_name}: {error}", flush=True)


def _submit_removal(executor, blob_name):
    """Queue a tag removal for *blob_name*, blocking while too many are pending."""
    in_flight.acquire()
    try:
        future = executor.submit(remove_blob_index_tag, blob_name)
    except BaseException:
        # The task was never queued, so no callback will give the slot back
        in_flight.release()
        raise
    future.add_done_callback(lambda done, name=blob_name: _on_removal_done(done, name))


# Create a ThreadPoolExecutor with the specified concurrency
with ThreadPoolExecutor(max_workers=concurrency) as executor:
    container_client = blob_service_client.get_container_client(container_name)

    def remove_index_tags_hierarchical(container_client: ContainerClient, prefix, depth):
        """Walk the blobs under *prefix* one hierarchy level at a time and queue tag removals.

        Args:
            container_client: Client of the container being walked.
            prefix: Blob-name prefix listed at this level ('/' is the delimiter).
            depth: Recursion depth of this level, used only for progress output.
        """
        global blob_count
        for blob in container_client.walk_blobs(name_starts_with=prefix, delimiter='/', include=['tags']):
            if isinstance(blob, BlobPrefix):
                # log progress only to a depth of 3 so we don't spam
                if depth < 4:
                    indent = " "
                    print(f"{indent * depth}{blob.name} (blob count: {blob_count})", flush=True)
                remove_index_tags_hierarchical(container_client, blob.name, depth + 1)
            elif blob['tags']:
                # The listing already includes the tags, so only tagged blobs cost a service call
                _submit_removal(executor, blob.name)
                blob_count += 1

    remove_index_tags_hierarchical(container_client, prefix, 0)

# Leaving the with block waits for every queued removal, so the counts below are final
removed_count = blob_count - len(failed_blobs)
print(f"This script removed index tags on {removed_count} blobs", flush=True)
if failed_blobs:
    print(f"Failed to remove index tags on {len(failed_blobs)} blobs", flush=True)
My rough estimate is that this can remove index tags from about 10-20 million blobs per 24 hours if it is run in the same Azure region as the storage account.