Blog Pages

PDF Page Comparison Tool

 import fitz  # PyMuPDF

from PIL import Image, ImageChops, ImageDraw

import difflib

import os


# Directory that receives the per-page text and image diff files.
OUTPUT_DIR = "pdf_diff_output"

# Runs at import time; exist_ok=True makes an already-existing directory a no-op.
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ----------------------------------------------------

# Extract text from a PDF page

# ----------------------------------------------------

def extract_text(pdf_path, page_num):
    """Return the text of a single page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to read.

    Returns:
        The text reported by the page's ``get_text()``, or ``""`` when
        ``page_num`` is at or beyond the document's page count.
    """
    # Open through a context manager so the document handle is released on
    # every return path instead of lingering until garbage collection.
    with fitz.open(pdf_path) as doc:
        if page_num >= len(doc):
            return ""
        return doc[page_num].get_text()


# ----------------------------------------------------

# Render page as image

# ----------------------------------------------------

def render_page_image(pdf_path, page_num, zoom=2):
    """Render one page of a PDF to a PIL image.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to render.
        zoom: Scale factor applied to both axes when rasterising.

    Returns:
        An RGB ``PIL.Image.Image`` built from the rendered pixmap.

    Raises:
        Whatever ``doc[page_num]`` raises for a page index the document does
        not have (presumably IndexError -- confirm against PyMuPDF).
    """
    # The context manager closes the document once the pixel data has been
    # copied into the PIL image, so no handle is leaked per rendered page.
    with fitz.open(pdf_path) as doc:
        page = doc[page_num]
        scale = fitz.Matrix(zoom, zoom)
        # alpha=False is requested explicitly: the "RGB" mode below relies on
        # the samples buffer holding exactly three bytes per pixel.
        pix = page.get_pixmap(matrix=scale, alpha=False)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)


# ----------------------------------------------------

# Text Difference

# ----------------------------------------------------

def text_diff(text1, text2):
    """Return a unified diff of two strings as one newline-joined string.

    Both inputs are split into lines and compared with
    ``difflib.unified_diff``; no file names are supplied for the header.
    The result is the empty string when the inputs have no differing lines.
    """
    before_lines = text1.splitlines()
    after_lines = text2.splitlines()
    # lineterm="" keeps the header lines free of trailing newlines so the
    # join below is the only source of line breaks in the result.
    hunks = difflib.unified_diff(before_lines, after_lines, lineterm="")
    return "\n".join(hunks)


# ----------------------------------------------------

# Image Difference (highlight changes)

# ----------------------------------------------------

def image_diff(img1, img2, threshold=50):
    """Return a difference image with changed pixels painted solid red.

    The per-channel absolute difference of the two images is computed, and
    every pixel whose red + green + blue difference exceeds ``threshold`` is
    replaced by pure red ``(255, 0, 0)``. Other pixels keep their raw
    difference value.

    Args:
        img1: First PIL image.
        img2: Second PIL image.
        threshold: Summed channel difference above which a pixel counts as
            changed. Intended range is 0-254: the channel sum saturates at
            255, so larger thresholds highlight nothing.

    Returns:
        A new RGB ``PIL.Image.Image``; neither input is modified.

    NOTE(review): handling of inputs with different sizes or modes is
    whatever ``ImageChops.difference`` does -- confirm if pages of unequal
    size must be supported.
    """
    diff = ImageChops.difference(img1, img2).convert("RGB")

    # Build the "changed" mask with C-level band arithmetic instead of a
    # Python loop over every pixel. ImageChops.add clips at 255, which cannot
    # alter a comparison against a threshold below 255.
    red, green, blue = diff.split()
    channel_sum = ImageChops.add(ImageChops.add(red, green), blue)
    changed = channel_sum.point(lambda value: 255 if value > threshold else 0)

    # Paint red wherever the mask is set; unmasked pixels are left as-is.
    diff.paste((255, 0, 0), mask=changed)
    return diff


# ----------------------------------------------------

# Compare PDFs

# ----------------------------------------------------

def compare_pdfs(pdf1, pdf2):
    """Compare two PDFs page by page and write the diffs to OUTPUT_DIR.

    For every page index up to the longer document's page count this writes:

    * ``page_<n>_text_diff.txt`` -- unified diff of the extracted page text
      (a page missing from one PDF is compared as empty text).
    * ``page_<n>_image_diff.png`` -- rendered-page difference image. Any
      exception while rendering or diffing is reported on stdout and that
      page's image is skipped; the comparison continues.

    Args:
        pdf1: Path of the first PDF.
        pdf2: Path of the second PDF.

    Returns:
        None. Progress and the output location are printed to stdout.
    """
    # Only the page counts are needed from these handles, so close them
    # straight away rather than leaving both documents open for the whole run.
    with fitz.open(pdf1) as doc1, fitz.open(pdf2) as doc2:
        total_pages = max(len(doc1), len(doc2))

    # Make sure the destination exists even if it was removed after import.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for page in range(total_pages):
        print(f"🔍 Comparing page {page + 1}")

        # ----- TEXT COMPARISON -----
        text1 = extract_text(pdf1, page)
        text2 = extract_text(pdf2, page)
        diff_text = text_diff(text1, text2)

        text_output = os.path.join(
            OUTPUT_DIR, f"page_{page + 1}_text_diff.txt"
        )
        with open(text_output, "w", encoding="utf-8") as f:
            f.write(diff_text)

        # ----- IMAGE COMPARISON -----
        # Best effort by design: a page that cannot be rendered (for example
        # one that exists in only one of the PDFs) must not abort the run.
        try:
            img1 = render_page_image(pdf1, page)
            img2 = render_page_image(pdf2, page)

            diff_img = image_diff(img1, img2)
            img_output = os.path.join(
                OUTPUT_DIR, f"page_{page + 1}_image_diff.png"
            )
            diff_img.save(img_output)
        except Exception as e:
            print(f"⚠️ Image diff skipped for page {page + 1}: {e}")

    print("\n✅ PDF comparison complete.")
    print(f"📁 Results saved in: {OUTPUT_DIR}")


# ----------------------------------------------------

# RUN

# ----------------------------------------------------

if __name__ == "__main__":
    # Ask for the two PDF paths in order, trimming surrounding whitespace,
    # then run the comparison on them.
    first_path, second_path = (
        input(prompt).strip()
        for prompt in ("Enter first PDF path: ", "Enter second PDF path: ")
    )
    compare_pdfs(first_path, second_path)


No comments:

Post a Comment