Blog Pages

Receipt Text Extractor & Analyzer

Step 1: OCR using pytesseract

from PIL import Image

import pytesseract


def extract_text_from_image(image_path):
    """Run OCR on an image and return the recognized text.

    Args:
        image_path: Location of the image; passed straight to
            ``Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    # Use the image as a context manager so its underlying file handle is
    # released deterministically once OCR is done, rather than whenever the
    # object happens to be garbage-collected. This also lets callers delete
    # the file immediately afterwards.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

Step 2: Parse Items and Prices with re

import re

def parse_items(raw_text):
    """Extract ``name price`` pairs from OCR'd receipt text.

    A pair is a run of letters/spaces (starting with a letter) followed on
    the same line by spaces and/or dot leaders and a price with exactly two
    decimals, e.g. ``"Bread 2.50"`` or ``"Milk ....... 1.25"``.

    Args:
        raw_text: Receipt text; ``None`` or an empty string yields no items.

    Returns:
        A tuple ``(items, total, avg)`` where ``items`` is a list of
        ``{"item": str, "price": float}`` dicts in order of appearance,
        ``total`` is the sum of the prices and ``avg`` is the mean price
        (``0`` when nothing was found).

    NOTE: every matching line is treated as an item, so summary lines such
    as ``"Total 3.75"`` are included in ``items`` and in ``total``.
    """
    # - The name class uses [ \t] rather than \s so a match can never span a
    #   line break and swallow an unpriced line (e.g. the shop header) into
    #   the next item's name.
    # - The separator accepts dots as well as blanks so dot leaders between
    #   the name and the price are skipped.
    pattern = r"([A-Za-z][A-Za-z \t]*?)[ \t.]+(\d+\.\d{2})"

    items = [
        {"item": name.strip(), "price": float(price)}
        for name, price in re.findall(pattern, raw_text or "")
    ]

    total = sum(entry["price"] for entry in items)
    # Guard against division by zero when no line matched.
    avg = total / len(items) if items else 0

    return items, total, avg

(Optional) Step 3: Streamlit Interface


import os
import tempfile

import pandas as pd
import streamlit as st

from utils.text_parser import extract_text_from_image, parse_items

# Streamlit page: upload a receipt image, OCR it, list the detected items
# with total/average cost, and offer the table as a CSV download.
st.title("🧾 Receipt Text Extractor & Analyzer")
uploaded_file = st.file_uploader("Upload Receipt Image", type=["jpg", "png", "jpeg"])

if uploaded_file:
    # extract_text_from_image() is given a filesystem path, so spool the
    # upload to disk first. delete=False keeps the file around after this
    # handle closes so it can be reopened by name.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        # getvalue() returns the full upload regardless of the current
        # stream position, unlike read().
        tmp.write(uploaded_file.getvalue())
        tmp_path = tmp.name

    try:
        raw_text = extract_text_from_image(tmp_path)
    finally:
        # delete=False means nobody else cleans this up; remove it so each
        # upload does not leave a stray file in the temp directory.
        os.unlink(tmp_path)

    items, total, avg = parse_items(raw_text)

    df = pd.DataFrame(items)
    st.subheader("🛒 Items Detected:")
    st.table(df)

    st.markdown(f"**Total Cost:** ₹{total:.2f}")
    st.markdown(f"**Average Item Cost:** ₹{avg:.2f}")

    # Download as CSV
    csv = df.to_csv(index=False).encode()
    st.download_button("📥 Download CSV", csv, "receipt_data.csv", "text/csv")

No comments:

Post a Comment