import tkinter as tk
from tkinter import filedialog, messagebox
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import re
import os
# Optional: point pytesseract at the Tesseract executable if it is not on PATH (e.g. on Windows).
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def extract_text_from_file(file_path: str) -> str:
    """Return the OCR text of a PDF or image file.

    PDFs are rasterised at 300 DPI with ``convert_from_path`` and every page
    image is passed to Tesseract; the page texts are concatenated in page
    order. PNG/JPEG files are opened with Pillow and OCR'd directly.

    Args:
        file_path: Path of the file to read. The file type is decided from
            the extension, compared case-insensitively.

    Returns:
        The recognised text, or an empty string when the extension is not
        ``.pdf``, ``.png``, ``.jpg`` or ``.jpeg``.
    """
    lowered = file_path.lower()

    if lowered.endswith('.pdf'):
        pages = convert_from_path(file_path, dpi=300)
        # Join once instead of growing a string with += for every page.
        return "".join(pytesseract.image_to_string(page) for page in pages)

    if lowered.endswith(('.png', '.jpg', '.jpeg')):
        # The context manager closes the image's file handle once OCR is
        # done; the original left it open.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    return ""
# "Invoice", an optional label (Number / Num / No, optionally followed by a
# dot, or "#"), an optional ":" or "#", then the identifier. The \b after the
# label words stops "No" from matching the start of an unrelated word such as
# "Note". Hyphenated identifiers (INV-001) are captured whole.
_INVOICE_NUMBER_RE = re.compile(
    r'Invoice\s*(?:(?:Number|Num|No)\b\.?|#)?\s*[:#]?\s*(\w+(?:-\w+)*)',
    re.IGNORECASE,
)

# "Date", an optional colon, then a slash-separated numeric date.
_DATE_RE = re.compile(
    r'Date\s*:?\s*(\d{1,2}/\d{1,2}/\d{2,4})',
    re.IGNORECASE,
)

# \b keeps "Total" from matching inside "Subtotal"; the amount may contain
# any number of "," / "." separated digit groups (e.g. 1,234.56).
_TOTAL_RE = re.compile(
    r'\bTotal\s*:?[\s$]*(\d+(?:[.,]\d+)*)',
    re.IGNORECASE,
)


def extract_invoice_data(text: str) -> dict:
    """Extract the invoice number, date and total amount from OCR text.

    Each field is taken from the first match of its pattern in ``text``;
    matching is case-insensitive.

    Args:
        text: Raw text of the invoice.

    Returns:
        A dict with the keys ``"Invoice Number"``, ``"Date"`` and
        ``"Total Amount"``. Each value is the matched string, or
        ``"Not Found"`` when the pattern does not occur in ``text``.
    """
    def first_group(pattern):
        found = pattern.search(text)
        return found.group(1).strip() if found else "Not Found"

    return {
        "Invoice Number": first_group(_INVOICE_NUMBER_RE),
        "Date": first_group(_DATE_RE),
        "Total Amount": first_group(_TOTAL_RE),
    }
def process_invoice() -> None:
    """Ask the user for an invoice file, OCR it and display the extracted fields.

    Opens a file-selection dialog filtered to PDF/JPG/PNG/JPEG files. If the
    dialog returns no path, nothing else happens. Otherwise the file is run
    through ``extract_text_from_file`` and ``extract_invoice_data`` and the
    file name, invoice number, date and total are shown in an information
    dialog. If text extraction raises, the error is shown in an error dialog
    instead of propagating out of the Tk callback.
    """
    file_path = filedialog.askopenfilename(
        title="Select Invoice",
        filetypes=[("PDF/Image Files", "*.pdf *.jpg *.png *.jpeg")],
    )
    if not file_path:
        return

    try:
        text = extract_text_from_file(file_path)
    except Exception as exc:
        # UI boundary: an exception escaping a Tk callback is only printed to
        # stderr, so surface OCR/conversion failures to the user instead.
        messagebox.showerror(
            "Invoice Extraction Failed",
            f"Could not read text from {os.path.basename(file_path)}:\n{exc}",
        )
        return

    data = extract_invoice_data(text)

    # The emoji were mis-encoded in the original strings and are restored
    # here. NOTE(review): the receipt and money-bag glyphs are recoverable
    # from the surviving bytes; the page and calendar glyphs are best guesses.
    # The message is assembled line by line so that source indentation never
    # ends up inside the displayed text.
    result = (
        "\n"
        f"📄 File: {os.path.basename(file_path)}\n"
        f"🧾 Invoice Number: {data['Invoice Number']}\n"
        f"📅 Date: {data['Date']}\n"
        f"💰 Total: {data['Total Amount']}\n"
    )
    messagebox.showinfo("Extracted Invoice Data", result)
# GUI Setup
def main() -> None:
    """Build the main window and run the Tk event loop until it is closed.

    The window holds an instruction label and a single "Select Invoice"
    button that triggers ``process_invoice``.
    """
    app = tk.Tk()
    # The receipt emoji was mis-encoded in the original title; restored here.
    app.title("🧾 Invoice Data Extractor")
    app.geometry("400x200")
    app.configure(bg="#f9f9f9")

    label = tk.Label(
        app,
        text="Click below to select an invoice PDF or image",
        bg="#f9f9f9",
        font=("Helvetica", 12),
    )
    label.pack(pady=20)

    btn = tk.Button(
        app,
        text="Select Invoice",
        command=process_invoice,
        bg="#4CAF50",
        fg="white",
        font=("Helvetica", 12),
        padx=10,
        pady=5,
    )
    btn.pack()

    app.mainloop()


# Launch the GUI only when executed as a script, so the extraction functions
# can be imported without opening a window.
if __name__ == "__main__":
    main()
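# NOTE: the blog-page footer text ("No comments:" / "Post a Comment") was pasted
# in with this script; it is kept only as a comment so the file parses as Python.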