import cv2
import pytesseract
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
# Set tesseract path if needed (Windows users)
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Predefined keywords for categorization
# Maps each expense category to the lowercase keywords that identify it.
# categorize_expense() walks the categories in the order written here and
# returns on the first hit, so earlier categories take precedence.
CATEGORY_KEYWORDS = {
    "Food": [
        "restaurant",
        "cafe",
        "pizza",
        "burger",
        "coffee",
        "food",
        "dine",
    ],
    "Travel": [
        "uber",
        "ola",
        "taxi",
        "flight",
        "airlines",
        "train",
        "bus",
        "travel",
    ],
    "Utilities": [
        "electricity",
        "water",
        "gas",
        "internet",
        "wifi",
        "bill",
        "utility",
    ],
    "Shopping": [
        "mall",
        "store",
        "supermarket",
        "shopping",
        "market",
        "groceries",
    ],
    # No keywords of its own: this is the fallback bucket.
    "Other": [],
}
def preprocess_image(image_path, *, invert=False):
    """Load an image and binarize it for OCR.

    The image is converted to grayscale and thresholded with Otsu's method,
    which picks the threshold automatically from the image histogram.

    Args:
        image_path: Path of the image file to load.
        invert: When False (default) the output keeps the input polarity, so
            a normal receipt (dark print on light paper) stays dark-on-light,
            which is what Tesseract 4+ expects. Pass True for images that are
            light text on a dark background.

    Returns:
        The binarized single-channel image as returned by cv2.threshold.

    Raises:
        OSError: If the file is missing or cannot be decoded as an image.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise OSError(f"Could not read image: {image_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    mode = cv2.THRESH_BINARY_INV if invert else cv2.THRESH_BINARY
    # With THRESH_OTSU the explicit threshold argument is ignored, hence 0.
    _, binary = cv2.threshold(gray, 0, 255, mode | cv2.THRESH_OTSU)
    return binary
def extract_text(image):
    """Run pytesseract OCR on the given image and return the recognized text."""
    recognized = pytesseract.image_to_string(image)
    return recognized
def categorize_expense(text, category_keywords=None):
    """Return the first category whose keywords occur in *text*.

    Matching is case-insensitive and anchored to the start of a word: a
    keyword matches itself and longer words that begin with it ("store"
    matches "stores"), but not the middle of an unrelated word ("ola" does
    not match "cola" or "chocolate").

    Args:
        text: Text to classify, e.g. the OCR output of a receipt.
        category_keywords: Optional mapping of category name to a list of
            keywords. Defaults to the module-level CATEGORY_KEYWORDS.
            Categories are tried in mapping order and the first hit wins.

    Returns:
        The name of the matching category, or "Other" if nothing matches.
    """
    if category_keywords is None:
        category_keywords = CATEGORY_KEYWORDS

    text_lower = text.lower()
    for category, keywords in category_keywords.items():
        for word in keywords:
            # (?<!\w) rejects hits that start in the middle of a word.
            if re.search(r"(?<!\w)" + re.escape(word.lower()), text_lower):
                return category
    return "Other"
# A price-like token: digits (optionally grouped as 1,234,567) followed by
# exactly two decimals. The lookarounds reject fragments of longer numbers
# ("12.345"), the tail of a grouped number, and dotted dates ("12.05.2023").
_AMOUNT_RE = re.compile(
    r"(?<![\d.])(?<!\d,)"
    r"(?:\d{1,3}(?:,\d{3})+|\d+)"
    r"\.\d{2}"
    r"(?!\d|\.\d)"
)


def extract_amount(text):
    """Return the largest two-decimal amount in *text*, assumed to be the total.

    Comma thousands separators are understood, so "1,234.56" is read as
    1234.56. Amounts written without two decimals (e.g. "500") are not
    recognized.

    Args:
        text: Text to scan, e.g. the OCR output of a receipt.

    Returns:
        The largest amount found as a float, or 0.0 when none is found.
    """
    amounts = [
        float(token.replace(",", "")) for token in _AMOUNT_RE.findall(text)
    ]
    return max(amounts, default=0.0)
def process_receipts(folder_path):
    """Run OCR, categorization and amount extraction on every receipt image.

    Files in *folder_path* whose name ends in .png, .jpg or .jpeg
    (case-insensitive) are processed in sorted name order so the row order
    is reproducible; everything else is skipped.

    Args:
        folder_path: Directory containing the receipt images.

    Returns:
        A DataFrame with columns "File", "Category", "Amount" and "Text",
        one row per image. The columns are present even when no image is
        found, so downstream grouping by "Category" still works.
    """
    columns = ["File", "Category", "Amount", "Text"]
    records = []
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith((".png", ".jpg", ".jpeg")):
            continue
        img_path = os.path.join(folder_path, file)
        text = extract_text(preprocess_image(img_path))
        records.append(
            {
                "File": file,
                "Category": categorize_expense(text),
                "Amount": extract_amount(text),
                "Text": text,
            }
        )
    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)
def plot_expenses(df):
    """Show a bar chart of the total amount per category.

    Args:
        df: DataFrame with at least the columns "Category" and "Amount".

    Returns:
        None. When *df* has no rows a message is printed and nothing is
        plotted.
    """
    # An empty frame has nothing to group or draw; bail out before plotting.
    if df.empty:
        print("No expenses to plot.")
        return

    category_totals = df.groupby("Category")["Amount"].sum()
    category_totals.plot(kind="bar", color="skyblue")
    plt.title("Expenses by Category")
    plt.xlabel("Category")
    plt.ylabel("Total Amount")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Script entry point: read every receipt image from the "receipts"
    # folder, print the resulting table, then chart the category totals.
    receipts_df = process_receipts("receipts")
    print(receipts_df)
    plot_expenses(receipts_df)
# No comments:
# Post a Comment