# Python for Engineers: Smart Receipt Scanner & Expense Categorizer

import cv2

import pytesseract

import pandas as pd

import matplotlib.pyplot as plt

import re

import os

# Set tesseract path if needed (Windows users)

# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Predefined keywords for categorization

# Maps each expense category to the lowercase keywords that identify it.
# Categories are tried in the order listed here; "Other" has no keywords
# because it is the fallback when nothing else matches.
CATEGORY_KEYWORDS = {
    "Food": [
        "restaurant", "cafe", "pizza", "burger", "coffee", "food", "dine",
    ],
    "Travel": [
        "uber", "ola", "taxi", "flight", "airlines", "train", "bus", "travel",
    ],
    "Utilities": [
        "electricity", "water", "gas", "internet", "wifi", "bill", "utility",
    ],
    "Shopping": [
        "mall", "store", "supermarket", "shopping", "market", "groceries",
    ],
    "Other": [],
}

def preprocess_image(image_path):
    """Load an image and binarize it for OCR.

    The image is read with OpenCV, converted to grayscale, and thresholded
    with Otsu's method using inverted binary output.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image produced by ``cv2.threshold``.

    Raises:
        ValueError: If OpenCV cannot read or decode the file.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, so
    # fail here with a clear message instead of deep inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # With THRESH_OTSU set, OpenCV picks the threshold itself, so the 150
    # passed here is not what decides the split.
    # NOTE(review): THRESH_BINARY_INV gives light text on a dark background
    # for a typical dark-on-white receipt; Tesseract's docs recommend
    # dark-on-light input -- confirm the inversion is intended.
    _, binary = cv2.threshold(
        gray, 150, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    )
    return binary

def extract_text(image):
    """Run pytesseract OCR on *image* and return the recognized text."""
    recognized = pytesseract.image_to_string(image)
    return recognized

def categorize_expense(text):
    """Return the first category whose keyword appears in *text* as a word.

    Matching is case-insensitive on the text side (the text is lowercased;
    keywords are expected to be lowercase already). A keyword must match a
    whole word, optionally followed by a plural "s"/"es", so that e.g.
    "bus" does not fire on "business" and "ola" does not fire on "cola".

    Args:
        text: Raw text of a receipt.

    Returns:
        The matching category name from CATEGORY_KEYWORDS, tried in the
        dictionary's iteration order, or "Other" when nothing matches.
    """
    lowered = text.lower()
    for category, keywords in CATEGORY_KEYWORDS.items():
        # An empty alternation would match at every word boundary, so
        # keyword-less categories (the fallback) are skipped explicitly.
        if not keywords:
            continue
        alternatives = "|".join(re.escape(word) for word in keywords)
        if re.search(rf"\b(?:{alternatives})(?:es|s)?\b", lowered):
            return category
    return "Other"

# A two-decimal amount, optionally written with comma thousands separators
# (e.g. "12.50" or "1,234.56"). The lookbehind stops a match from starting
# in the middle of a longer digit run.
_AMOUNT_RE = re.compile(r"(?<!\d)(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}")


def extract_amount(text):
    """Return the largest two-decimal amount found in *text*.

    The largest amount is assumed to be the receipt total. Amounts may use
    comma thousands separators ("1,234.56" is read as 1234.56).

    Args:
        text: Raw text of a receipt.

    Returns:
        The largest amount as a float, or 0.0 when no amount is found.
    """
    values = [float(match.replace(",", "")) for match in _AMOUNT_RE.findall(text)]
    return max(values, default=0.0)

def process_receipts(folder_path):
    """OCR every receipt image in a folder and tabulate the results.

    Files whose names end in .png, .jpg or .jpeg (case-insensitive) are
    processed in sorted name order. An image that cannot be read is
    reported on stdout and skipped so one bad file does not abort the batch.

    Args:
        folder_path: Directory containing the receipt images.

    Returns:
        A pandas DataFrame with the columns "File", "Category", "Amount"
        and "Text", one row per processed image. The columns are present
        even when no image was processed.
    """
    columns = ["File", "Category", "Amount", "Text"]
    records = []

    # os.listdir gives entries in arbitrary order; sort for stable output.
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith((".png", ".jpg", ".jpeg")):
            continue

        img_path = os.path.join(folder_path, file)
        try:
            pre_img = preprocess_image(img_path)
        except (cv2.error, ValueError) as exc:
            print(f"Skipping unreadable image {file}: {exc}")
            continue

        text = extract_text(pre_img)
        records.append(
            {
                "File": file,
                "Category": categorize_expense(text),
                "Amount": extract_amount(text),
                "Text": text,
            }
        )

    # Explicit columns keep the frame's schema stable when records is empty,
    # so callers can still group by "Category".
    return pd.DataFrame(records, columns=columns)

def plot_expenses(df):
    """Show a bar chart of the summed "Amount" per "Category".

    Args:
        df: DataFrame with "Category" and "Amount" columns.

    Returns:
        None. When *df* is empty a message is printed and nothing is drawn.
    """
    # An empty frame has nothing to group or draw; bail out with a message
    # instead of failing inside groupby or plot.
    if df.empty:
        print("No expenses to plot.")
        return

    category_totals = df.groupby("Category")["Amount"].sum()
    category_totals.plot(kind="bar", color="skyblue")

    plt.title("Expenses by Category")
    plt.xlabel("Category")
    plt.ylabel("Total Amount")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    # Scan every receipt image in the local "receipts" folder, print the
    # resulting table, then chart the totals per category.
    folder = "receipts"
    df = process_receipts(folder)
    print(df)
    plot_expenses(df)

# Python for Engineers
#
# Blog Pages
#
# Smart Receipt Scanner & Expense Categorizer
#
# No comments: