Generate searchable PDFs with Azure Form Recognizer

Microsoft

May 13, 2024

I had an offline discussion with ramprasadgajula and provided a code sample of an Azure Function that produces a searchable PDF by editing the existing PDF instead of rendering it. I am sharing this code sample here in case it can benefit others.

This solution uses a different approach: it modifies the existing PDF using the PyMuPDF package instead of rendering it using pdf2image. It will not work for PDFs that have a mix of digital text and images, since the text will be duplicated. It may also hit text-alignment bugs on complex PDFs, since PDF editing is more complex than rendering. Even though this solution is less generic than PDF rendering (described in the original blog post), it has a few advantages:

No dependency on the Poppler (pdf2image) binaries, which makes it easy to run as an Azure Function (see code below).
The existing PDF structure/metadata stays as is (annotations, objects, etc.).
The PDF file size increases predictably: it grows only slightly due to the added text elements (vs. rendering with a specific DPI/JPEG quality).
The code requires very little CPU to add text elements compared to rendering/image encoding.

This is a very basic Azure Function implementation with an HTTP trigger: pass the URL of the file as the query parameter “fileurl” and it returns the searchable PDF file, so it can be viewed right in the browser:

https://%%YOUR_FUNCTION%%.azurewebsites.net/api/searchable_pdf?fileurl=https://documentintelligence.ai.azure.com/documents/samples/document/generaldoc.pdf&code=%%YOUR_AZURE_FUNCTION_CODE%%

You will need to add FR_ENDPOINT and FR_KEY to your Azure Function's environment variables.

function_app.py:

import azure.functions as func
import logging
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import os
import math
import fitz # PyMuPDF 
import requests
import traceback
from urllib.parse import urlparse

# Function app object; routes registered on it default to function-level
# authorization (callers supply the function key, e.g. via the `code` query
# parameter shown in the sample URL).
app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
# Module-wide logger obtained under the 'azure' logger name.
logger = logging.getLogger('azure')

def adjust_position_for_rotation(page, position):
    """Adjust a text position based on the page rotation.

    Args:
        page: Object exposing ``rotation`` (degrees) and ``rect`` with
            ``width`` and ``height`` attributes (a PyMuPDF page in this file).
        position: Indexable ``(x, y)`` pair.

    Returns:
        A ``(x, y)`` tuple:
          * rotation 90  -> ``(y, width - x)``
          * rotation 180 -> ``(width - x, height - y)``
          * rotation 270 -> ``(height - y, x)``
          * any other rotation (including 0) -> ``(x, y)`` unchanged

    NOTE(review): presumably this maps coordinates on the displayed
    (rotated) page to the unrotated coordinates used for insertion --
    confirm against PyMuPDF's coordinate conventions.
    """
    x, y = position[0], position[1]
    rotation = page.rotation
    if rotation == 90:
        return y, page.rect.width - x
    if rotation == 180:
        return page.rect.width - x, page.rect.height - y
    if rotation == 270:
        return page.rect.height - y, x
    # No rotation (or an unexpected value): leave the position untouched.
    return x, y
    
def dist(p1, p2):
    """Return the Euclidean distance between two points.

    Both arguments must expose numeric ``x`` and ``y`` attributes.
    """
    delta_x = p1.x - p2.x
    delta_y = p1.y - p2.y
    squared_length = delta_x * delta_x + delta_y * delta_y
    return math.sqrt(squared_length)

def _overlay_ocr_words(pdf_page, ocr_page):
    """Insert the OCR words of ``ocr_page`` into ``pdf_page`` as invisible text.

    Args:
        pdf_page: PyMuPDF page that receives the text layer.
        ocr_page: Form Recognizer page result; its ``width``/``height`` and the
            ``polygon``/``content`` of each entry in ``words`` are read.
    """
    # Called before inserting new content, as PyMuPDF recommends, so the
    # page's existing graphics state does not affect the inserted text.
    pdf_page.wrap_contents()

    # Factor converting OCR units to PDF page units.
    page_width = float(pdf_page.rect.width)
    page_height = float(pdf_page.rect.height)
    scale = 1.0 * (page_width + page_height) / (ocr_page.width + ocr_page.height)

    shape = pdf_page.new_shape()
    for word in ocr_page.words:
        polygon = word.polygon

        # Target box size: the longer of each pair of opposite polygon edges.
        desired_text_width = max(dist(polygon[0], polygon[1]), dist(polygon[3], polygon[2])) * scale
        desired_text_height = max(dist(polygon[1], polygon[2]), dist(polygon[0], polygon[3])) * scale
        font_size = desired_text_height
        actual_text_width = fitz.get_text_length(word.content, fontsize=font_size)
        if actual_text_width <= 0:
            # Empty content or a degenerate (zero-height) polygon: nothing to
            # draw, and the horizontal scale below would divide by zero.
            continue

        # Text rotation angle, averaged over the top and bottom polygon edges.
        text_angle = math.atan2(
            (polygon[1].y - polygon[0].y + polygon[2].y - polygon[3].y) / 2.0,
            (polygon[1].x - polygon[0].x + polygon[2].x - polygon[3].x) / 2.0,
        )

        # Rotate to the word's angle (compensating for page rotation) and
        # stretch horizontally so the text spans the OCR box width.
        matrix = fitz.Matrix(fitz.Identity)
        matrix.prerotate(-math.degrees(text_angle) + pdf_page.rotation)
        matrix.prescale(desired_text_width / actual_text_width, 1)

        # Anchor the text at polygon point 3, scaled to page units.
        pos = fitz.Point(adjust_position_for_rotation(pdf_page, (polygon[3].x * scale, polygon[3].y * scale)))

        # render_mode=3 is the PDF "invisible" text mode: the text is
        # searchable/selectable but not painted.
        shape.insert_text(pos, word.content, fontsize=font_size, render_mode=3, morph=(pos, matrix))

    shape.commit(overlay=True)


@app.route(route="searchable_pdf")
def searchable_pdf(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: download a PDF, OCR it, and return a searchable copy.

    Query parameters:
        fileurl: absolute http(s) URL of the PDF to process.

    Environment variables:
        FR_ENDPOINT, FR_KEY: Form Recognizer endpoint and key.

    Returns:
        200 with the PDF bytes (``application/pdf``) on success; 400 with a
        plain-text message when ``fileurl`` is missing/invalid, the download
        fails, or any exception is raised while processing.
    """
    # Upper bound (seconds) for the download so a stalled server cannot hang
    # the invocation indefinitely.
    download_timeout = 60

    try:
        logging.info('Python HTTP trigger function processed a request.')

        logging.info('Getting FR keys...')
        endpoint = os.environ["FR_ENDPOINT"]
        key = os.environ["FR_KEY"]
        logging.info(f'{endpoint} resource will be used later.')

        fileurl = req.params.get('fileurl')
        if not fileurl:
            return func.HttpResponse(
                "Please pass a fileurl on the query string",
                status_code=400
            )

        fileurl = urlparse(fileurl)
        # Only fetch absolute web URLs; anything else is a caller error.
        # NOTE(review): this does not stop requests to internal hosts --
        # consider an allow-list if the function is exposed to untrusted callers.
        if fileurl.scheme not in ("http", "https") or not fileurl.netloc:
            return func.HttpResponse(
                "The fileurl parameter must be an absolute http or https URL",
                status_code=400
            )

        logger.info(f"Downloading {fileurl.geturl()}")
        file_get = requests.get(fileurl.geturl(), timeout=download_timeout)
        if file_get.status_code != 200:
            return func.HttpResponse(
                f"Failed to download file from {fileurl.geturl()}. Status code: {file_get.status_code}",
                status_code=400
            )

        searchable_file_name = os.path.basename(fileurl.path) + ".ocr.pdf"
        logger.info("Downloaded")
        logger.info(f"Loading pdf as {searchable_file_name}...")
        existing_pdf = fitz.open(searchable_file_name, file_get.content)
        try:
            logger.info(f"Loaded {len(existing_pdf)} pages from PDF file.")
            logger.info("Starting Azure Form Recognizer OCR process...")
            document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key), headers={"x-ms-useragent": "searchable-pdf-blog/1.0.7"})
            poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-read", document_url=fileurl.geturl())
            ocr_results = poller.result()
            logger.info(f"Azure Form Recognizer finished OCR text for {len(ocr_results.pages)} pages.")

            # Generate OCR overlay layer
            logger.info("Generating searchable PDF...")
            for page_id, ocr_page in enumerate(ocr_results.pages):
                _overlay_ocr_words(existing_pdf[page_id], ocr_page)

            pdf_bytes = existing_pdf.tobytes(deflate=True, linear=True, garbage=4)
        finally:
            # Release the document even when OCR or overlay generation fails.
            existing_pdf.close()

        # A Content-Disposition value must start with a disposition type;
        # "inline" lets the browser display the PDF directly. Double quotes
        # are stripped so the quoted filename parameter stays well-formed.
        header_file_name = searchable_file_name.replace('"', '')
        return func.HttpResponse(
            pdf_bytes,
            status_code=200,
            mimetype="application/pdf",
            headers={"Content-Disposition": f'inline; filename="{header_file_name}"'}
        )
    except Exception as e:
        logger.exception("An error occurred:")
        # NOTE(review): the full traceback is returned to the caller with a
        # 400 status; kept as in the original sample, but consider a generic
        # 500 response for production use.
        return func.HttpResponse(
             f"An error occurred:\n{traceback.format_exc()}",
             status_code=400
        )

requirements.txt

# DO NOT include azure-functions-worker in this file
# The Python Worker is managed by Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
azure-ai-formrecognizer
pymupdf==1.23.* # latest version has some issue with coordinate system
requests

Blog Post

Generate searchable PDFs with Azure Form Recognizer