import fitz # PyMuPDF
from PIL import Image, ImageChops, ImageDraw
import difflib
import os
# Folder that receives every per-page text and image diff. It is created
# when the module is loaded so later writes can assume it exists.
OUTPUT_DIR = "pdf_diff_output"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)
# ----------------------------------------------------
# Extract text from a PDF page
# ----------------------------------------------------
def extract_text(pdf_path, page_num):
    """Return the text of a single page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to read.

    Returns:
        The string produced by the page's ``get_text()``, or ``""`` when
        ``page_num`` is at or past the document's page count.
    """
    # Open through the context manager so the document is closed on every
    # exit path, including the early return and any extraction error.
    with fitz.open(pdf_path) as doc:
        if page_num >= len(doc):
            return ""
        return doc[page_num].get_text()
# ----------------------------------------------------
# Render page as image
# ----------------------------------------------------
def render_page_image(pdf_path, page_num, zoom=2):
    """Render one PDF page to an RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to render. No bounds check
            is done here; an invalid index raises from PyMuPDF.
        zoom: Scale factor applied to both axes when rasterising.

    Returns:
        A ``PIL.Image.Image`` in mode ``"RGB"`` holding the rendered page.
    """
    with fitz.open(pdf_path) as doc:
        page = doc[page_num]
        # alpha=False keeps the sample buffer at three bytes per pixel,
        # which is what the "RGB" mode passed to frombytes expects.
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        # Build the image before leaving the with-block, while the document
        # that owns the page is still open.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
# ----------------------------------------------------
# Text Difference
# ----------------------------------------------------
def text_diff(text1, text2):
    """Return a unified diff of two texts as a single string.

    Both inputs are split into lines and compared with
    ``difflib.unified_diff``. No file names are supplied, so the two header
    lines carry only the ``---`` / ``+++`` markers.

    Args:
        text1: The "before" text.
        text2: The "after" text.

    Returns:
        The diff lines joined with ``"\\n"``; an empty string when the two
        texts have identical lines.
    """
    diff_lines = difflib.unified_diff(
        text1.splitlines(),
        text2.splitlines(),
        # Inputs have no trailing newlines after splitlines(), so the
        # generated control lines must not add any either.
        lineterm="",
    )
    return "\n".join(diff_lines)
# ----------------------------------------------------
# Image Difference (highlight changes)
# ----------------------------------------------------
def image_diff(img1, img2):
    """Return an image that highlights where two images differ.

    The per-channel absolute difference is computed first. Every pixel
    whose red + green + blue difference exceeds 50 is then painted pure
    red; all other pixels keep their raw difference value.

    Args:
        img1: First PIL image.
        img2: Second PIL image.

    Returns:
        A new ``"RGB"`` PIL image; the inputs are not modified.

    NOTE(review): when the two images differ in size, Pillow appears to
    compare only the overlapping top-left region -- confirm if pages of
    different dimensions must be supported.
    """
    # Normalise both inputs to RGB so images of different modes can be
    # compared instead of raising inside ImageChops.difference.
    diff = ImageChops.difference(img1.convert("RGB"), img2.convert("RGB"))

    # Sum the three channels with band arithmetic rather than a Python loop
    # over every pixel. ImageChops.add saturates at 255, which cannot
    # affect a comparison against 50.
    red, green, blue = diff.split()
    channel_sum = ImageChops.add(ImageChops.add(red, green), blue)
    mask = channel_sum.point(lambda value: 255 if value > 50 else 0)

    # Highlight differences in red, only where the mask is set.
    diff.paste((255, 0, 0), mask=mask)
    return diff
# ----------------------------------------------------
# Compare PDFs
# ----------------------------------------------------
def compare_pdfs(pdf1, pdf2):
    """Compare two PDFs page by page and write the results to OUTPUT_DIR.

    For every page index up to the longer document's page count, this
    writes ``page_<n>_text_diff.txt`` (unified diff of the extracted text)
    and, when both pages can be rendered, ``page_<n>_image_diff.png``.
    Progress and a final summary are printed to stdout.

    Args:
        pdf1: Path of the first PDF.
        pdf2: Path of the second PDF.

    Returns:
        None.
    """
    # The documents are opened here only to learn their page counts; close
    # them immediately instead of leaking the handles for the whole run.
    with fitz.open(pdf1) as doc1, fitz.open(pdf2) as doc2:
        total_pages = max(len(doc1), len(doc2))

    # Re-ensure the output folder in case it was removed after import.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for page in range(total_pages):
        print(f"🔍 Comparing page {page + 1}")

        # ----- TEXT COMPARISON -----
        # extract_text returns "" for a page missing from the shorter
        # document, so that page shows up as fully added or removed.
        text1 = extract_text(pdf1, page)
        text2 = extract_text(pdf2, page)
        diff_text = text_diff(text1, text2)
        text_output = os.path.join(
            OUTPUT_DIR, f"page_{page + 1}_text_diff.txt"
        )
        with open(text_output, "w", encoding="utf-8") as f:
            f.write(diff_text)

        # ----- IMAGE COMPARISON -----
        # Best effort: a page that cannot be rendered or compared (for
        # example, one that exists in only one document) is reported and
        # skipped so the remaining pages are still processed.
        try:
            img1 = render_page_image(pdf1, page)
            img2 = render_page_image(pdf2, page)
            diff_img = image_diff(img1, img2)
            img_output = os.path.join(
                OUTPUT_DIR, f"page_{page + 1}_image_diff.png"
            )
            diff_img.save(img_output)
        except Exception as e:
            print(f"⚠️ Image diff skipped for page {page + 1}: {e}")

    print("\n✅ PDF comparison complete.")
    print(f"📁 Results saved in: {OUTPUT_DIR}")
# ----------------------------------------------------
# RUN
# ----------------------------------------------------
if __name__ == "__main__":
    # Script entry point: prompt for the two PDF paths, trimming stray
    # whitespace from each answer, then run the comparison.
    pdf_a = input("Enter first PDF path: ").strip()
    pdf_b = input("Enter second PDF path: ").strip()
    compare_pdfs(pdf_a, pdf_b)
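# NOTE: the text "No comments: / Post a Comment" that followed here is
# residue from the web page this script was copied from, not program code.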