"""Smart Receipt Scanner & Expense Categorizer."""

import cv2

import pytesseract

import pandas as pd

import matplotlib.pyplot as plt

import re

import os


# Set tesseract path if needed (Windows users)

# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


# Predefined keywords for categorization.
#
# Maps a category name to the lowercase keywords that select it.
# categorize_expense walks this dict from top to bottom and returns the first
# category that has a matching keyword, so the order of the entries decides
# which category wins when a receipt mentions keywords from several of them.
# "Other" has no keywords: it is the name returned when nothing matches.

CATEGORY_KEYWORDS = {

    "Food": ["restaurant", "cafe", "pizza", "burger", "coffee", "food", "dine"],

    "Travel": ["uber", "ola", "taxi", "flight", "airlines", "train", "bus", "travel"],

    "Utilities": ["electricity", "water", "gas", "internet", "wifi", "bill", "utility"],

    "Shopping": ["mall", "store", "supermarket", "shopping", "market", "groceries"],

    "Other": []

}


def preprocess_image(image_path, *, invert=True):
    """Load an image and binarize it for OCR.

    The file is read with OpenCV, converted to grayscale and thresholded
    with Otsu's method, so the result only contains the values 0 and 255.

    Args:
        image_path: Path of the image file to load.
        invert: When True (the default, which is the behaviour this function
            has always had) the binary image is inverted: pixels brighter
            than the Otsu threshold become 0 and the rest become 255. Pass
            False to keep the original polarity.
            NOTE(review): Tesseract's own quality guide recommends dark text
            on a light background for 4.x and later, so ``invert=False`` may
            give better OCR results on ordinary paper receipts. Confirm on
            real data before changing the default.

    Returns:
        The thresholded single-channel image produced by ``cv2.threshold``.

    Raises:
        ValueError: If the file is missing or cannot be decoded as an image.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface later as an opaque error inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    mode = cv2.THRESH_BINARY_INV if invert else cv2.THRESH_BINARY
    # With THRESH_OTSU the threshold is computed from the image and the value
    # passed here is ignored, so the conventional 0 is used.
    _, binary = cv2.threshold(gray, 0, 255, mode | cv2.THRESH_OTSU)
    return binary


def extract_text(image):
    """Run Tesseract OCR on ``image`` and return the recognised text.

    ``image`` is handed to ``pytesseract.image_to_string`` unchanged and the
    string it produces is returned as-is.
    """
    recognised = pytesseract.image_to_string(image)
    return recognised


def categorize_expense(text, keywords=None):
    """Return the first category whose keywords occur in ``text``.

    Keywords are matched case-insensitively as whole words, with an optional
    trailing "s" so simple plurals ("pizzas", "taxis") still match. Matching
    whole words avoids false hits such as "ola" inside "cola" or "bus" inside
    "business".

    Args:
        text: Text of the receipt (typically OCR output).
        keywords: Optional mapping of category name to a list of keywords.
            Defaults to the module-level ``CATEGORY_KEYWORDS``. Categories are
            tried in the mapping's iteration order and the first match wins.

    Returns:
        The matching category name, or "Other" when no keyword is found.
    """
    if keywords is None:
        keywords = CATEGORY_KEYWORDS

    for category, words in keywords.items():
        if not words:
            # A category without keywords (e.g. "Other") can never match.
            continue
        alternation = "|".join(re.escape(word) for word in words)
        if re.search(rf"\b(?:{alternation})s?\b", text, re.IGNORECASE):
            return category

    return "Other"


# A monetary amount: digits (optionally grouped with thousands commas) followed
# by exactly two decimals. The look-arounds reject fragments of longer numbers
# ("12.345"), of dotted dates ("12.05.2023") and of malformed groupings.
_AMOUNT_RE = re.compile(r"(?<![\d.,])(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}(?!\d|\.\d)")


def extract_amount(text):
    """Return the largest two-decimal amount found in ``text``.

    The largest amount is assumed to be the receipt total. This is a
    heuristic: a receipt that also prints a larger figure (for example the
    cash tendered) will yield that figure instead.

    Amounts may use commas as thousands separators ("1,234.56").

    Args:
        text: Text of the receipt (typically OCR output).

    Returns:
        The largest amount as a float, or 0.0 when no amount is found.
    """
    amounts = [float(match.replace(",", "")) for match in _AMOUNT_RE.findall(text)]
    return max(amounts) if amounts else 0.0


def process_receipts(folder_path):
    """Run OCR on every receipt image in a folder and tabulate the results.

    Files whose name ends in .png, .jpg or .jpeg (case-insensitive) are
    processed in sorted name order, so the row order is reproducible across
    runs and platforms. Every other directory entry is ignored.

    Args:
        folder_path: Directory containing the receipt images.

    Returns:
        A DataFrame with one row per image and the columns "File",
        "Category", "Amount" and "Text". The columns are present even when
        the folder holds no images, so downstream code can rely on them.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (for example when it does not exist).
    """
    columns = ["File", "Category", "Amount", "Text"]
    image_suffixes = (".png", ".jpg", ".jpeg")
    records = []

    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(image_suffixes):
            continue

        img_path = os.path.join(folder_path, filename)
        text = extract_text(preprocess_image(img_path))
        records.append({
            "File": filename,
            "Category": categorize_expense(text),
            "Amount": extract_amount(text),
            "Text": text,
        })

    return pd.DataFrame(records, columns=columns)


def plot_expenses(df):
    """Show a bar chart of the total amount spent per category.

    Args:
        df: DataFrame with at least the columns "Category" and "Amount",
            as produced by ``process_receipts``.

    Returns:
        None. The chart is displayed with ``plt.show()``. When ``df`` has no
        rows a short message is printed and nothing is drawn.
    """
    # An empty frame has nothing to group or draw; without this guard the
    # groupby/plot calls below fail with an unhelpful error.
    if df.empty:
        print("No expenses to plot.")
        return

    category_totals = df.groupby("Category")["Amount"].sum()
    category_totals.plot(kind="bar", color="skyblue")

    plt.title("Expenses by Category")
    plt.xlabel("Category")
    plt.ylabel("Total Amount")
    plt.xticks(rotation=45)
    # Keep the rotated tick labels from being clipped at the figure edge.
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":

    # Script entry point: OCR every receipt image in the folder, print the
    # resulting table, then show a bar chart of the totals per category.

    folder = "receipts"  # Folder containing receipt images (relative path)

    df = process_receipts(folder)

    print(df)

    plot_expenses(df)


# No comments: