Python for Engineers: Invoice Data Extractor

import tkinter as tk

from tkinter import filedialog, messagebox

from PIL import Image

import pytesseract

from pdf2image import convert_from_path

import re

import os

# Optional: Set Tesseract path

# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text_from_file(file_path):
    """Run OCR on an invoice file and return the recognised text.

    PDF files are converted to page images with ``convert_from_path`` at
    300 DPI and every page is passed to ``pytesseract.image_to_string``.
    PNG/JPEG files are opened with PIL and OCR'd directly. The extension
    check is case-insensitive.

    Args:
        file_path: Path to a ``.pdf``, ``.png``, ``.jpg`` or ``.jpeg`` file.

    Returns:
        The OCR text (page texts concatenated in page order for a PDF), or
        an empty string when the extension is not one of the supported ones.
    """
    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        pages = convert_from_path(file_path, dpi=300)
        # Join once instead of growing a string with += for every page.
        return "".join(pytesseract.image_to_string(page) for page in pages)

    if lowered_path.endswith(('.png', '.jpg', '.jpeg')):
        # The context manager closes the underlying file handle once OCR is
        # done; a bare Image.open() would leave it open.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    # Unsupported extension: nothing to OCR.
    return ""

# "Invoice" followed by an optional "Number"/"Num"/"No" label word, an
# optional "#" and ":", then the identifier itself. The \b after the label
# word keeps words such as "Notes" from being mistaken for "No". Hyphenated
# identifiers (e.g. INV-2024-001) are captured whole.
_INVOICE_NUMBER_RE = re.compile(
    r'Invoice\s*(?:(?:Number|Num|No)\b\.?)?\s*#?\s*:?\s*(\w+(?:-\w+)*)',
    re.IGNORECASE,
)

# "Date" followed by an optional ":" and a d/m/y style date with slashes.
_INVOICE_DATE_RE = re.compile(
    r'Date\s*:?(\s*\d{1,2}/\d{1,2}/\d{2,4})',
    re.IGNORECASE,
)

# "Total" as a whole word (so "Subtotal" is skipped), optional ":" and "$",
# then digits with any number of "." / "," separated groups (1,234.56).
_INVOICE_TOTAL_RE = re.compile(
    r'\bTotal\s*:?[\s$]*(\d+(?:[.,]\d+)*)',
    re.IGNORECASE,
)


def extract_invoice_data(text):
    """Pull the invoice number, date and total amount out of OCR text.

    Each field is located with a case-insensitive regular expression and the
    first match in ``text`` wins.

    Args:
        text: Raw text of the invoice (for example the OCR output).

    Returns:
        A dict with the keys ``"Invoice Number"``, ``"Date"`` and
        ``"Total Amount"``. Each value is the matched string, or
        ``"Not Found"`` when the corresponding pattern does not match.
    """
    number_match = _INVOICE_NUMBER_RE.search(text)
    date_match = _INVOICE_DATE_RE.search(text)
    total_match = _INVOICE_TOTAL_RE.search(text)

    return {
        "Invoice Number": number_match.group(1) if number_match else "Not Found",
        # The date group may start with whitespace that followed the label.
        "Date": date_match.group(1).strip() if date_match else "Not Found",
        "Total Amount": total_match.group(1) if total_match else "Not Found",
    }

def process_invoice():
    """Ask the user for an invoice file, OCR it and show the extracted fields.

    Opens a file-selection dialog restricted to PDF/JPG/PNG/JPEG files. If
    the dialog is cancelled nothing happens. Otherwise the file is passed to
    ``extract_text_from_file`` and ``extract_invoice_data`` and the resulting
    invoice number, date and total are shown in an info message box. If text
    extraction raises, the error is shown in an error message box instead of
    being lost inside the Tk callback.
    """
    file_path = filedialog.askopenfilename(
        title="Select Invoice",
        filetypes=[("PDF/Image Files", "*.pdf *.jpg *.png *.jpeg")],
    )
    if not file_path:
        # Dialog was cancelled.
        return

    file_name = os.path.basename(file_path)

    try:
        text = extract_text_from_file(file_path)
    except Exception as exc:
        # Top-level GUI boundary: OCR/PDF conversion can fail for many
        # reasons (missing OCR backend, unreadable or corrupt file). Report
        # the failure to the user rather than failing silently.
        messagebox.showerror(
            "Invoice Extraction Failed",
            f"Could not read {file_name}:\n\n{exc}",
        )
        return

    data = extract_invoice_data(text)

    summary_lines = [
        f"📄 File: {file_name}",
        f"🧾 Invoice Number: {data['Invoice Number']}",
        f"📅 Date: {data['Date']}",
        f"💰 Total: {data['Total Amount']}",
    ]
    # Entries are separated (and surrounded) by blank lines in the dialog.
    result = "\n\n" + "\n\n".join(summary_lines) + "\n\n"

    messagebox.showinfo("Extracted Invoice Data", result)

# GUI Setup

# Background colour and font shared by the window and its widgets.
_WINDOW_BG = "#f9f9f9"
_WIDGET_FONT = ("Helvetica", 12)

# Main window.
app = tk.Tk()
app.title("🧾 Invoice Data Extractor")
app.geometry("400x200")
app.configure(bg=_WINDOW_BG)

# Instruction text shown above the button.
label = tk.Label(
    app,
    text="Click below to select an invoice PDF or image",
    bg=_WINDOW_BG,
    font=_WIDGET_FONT,
)
label.pack(pady=20)

# Button that starts the select-file / OCR / show-result flow.
btn = tk.Button(
    app,
    text="Select Invoice",
    command=process_invoice,
    bg="#4CAF50",
    fg="white",
    font=_WIDGET_FONT,
    padx=10,
    pady=5,
)
btn.pack()

# Hand control to the Tk event loop until the window is closed.
app.mainloop()

Python for Engineers

Blog Pages

Invoice Data Extractor

No comments: