# AI Whiteboard Digitizer

#!/usr/bin/env python3

"""

AI Whiteboard Digitizer (prototype)


Usage:

    python whiteboard_digitizer.py input_image.jpg


Outputs (written to ./wb_outputs/):

 - ocr_texts.txt          : OCR'd text lines (raw)

 - equations_latex.tex    : LaTeX for parseable math expressions

 - diagram.svg            : vector-like rendering of detected lines/circles

 - debug_*.png, overlay_debug.png : intermediate debug images

"""


import sys

import os

import cv2

import numpy as np

from PIL import Image

import pytesseract

from sympy import sympify, latex

from sympy.core.sympify import SympifyError

import svgwrite

import matplotlib.pyplot as plt


# If on Windows and tesseract is not in PATH, set path here (uncomment and adjust)

# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


# ---------- Utilities ----------

def ensure_dir(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race that a separate
    # os.path.exists() test followed by os.makedirs() is exposed to.
    os.makedirs(path, exist_ok=True)


# Preprocess: grayscale, denoise, adaptive threshold, deskew

def preprocess_image(img_bgr, max_dim=1600):
    """Prepare a whiteboard photo for OCR and shape detection.

    The image is downscaled so its longer side is at most ``max_dim`` pixels
    (it is never upscaled), converted to grayscale, denoised, binarized with
    an inverted adaptive threshold, and finally deskewed.

    Args:
        img_bgr: BGR image array (e.g. the result of ``cv2.imread``).
        max_dim: Maximum allowed length, in pixels, of the longer image side.

    Returns:
        Tuple ``(gray, th)``: the denoised, deskewed grayscale image and the
        deskewed inverted binary mask (dark strokes become non-zero).
    """
    # Resize to a manageable size, keeping the aspect ratio.
    h, w = img_bgr.shape[:2]
    scale = min(1.0, float(max_dim) / max(h, w))
    if scale != 1.0:
        # Clamp to 1 px so an extreme aspect ratio never yields a 0-sized target.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img_bgr = cv2.resize(img_bgr, new_size, interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # Non-local-means denoise, then a bilateral filter to smooth while
    # preserving stroke edges.
    gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    gray = cv2.bilateralFilter(gray, 9, 75, 75)
    # Adaptive threshold (whiteboard: dark text on light background).
    th = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY_INV, 25, 12)
    # Deskew using the minimum-area rectangle around all foreground pixels.
    coords = np.column_stack(np.where(th > 0))
    if coords.shape[0] > 0:
        angle = cv2.minAreaRect(coords)[-1]
        # Older OpenCV releases report this angle in [-90, 0); newer ones
        # (4.5.1 and later) use (0, 90]. Fold both conventions into a small
        # correction so an upright image is not rotated by ~90 degrees.
        if angle < -45:
            angle = -(90 + angle)
        elif angle > 45:
            angle = 90 - angle
        else:
            angle = -angle
        (h2, w2) = gray.shape[:2]
        M = cv2.getRotationMatrix2D((w2 // 2, h2 // 2), angle, 1.0)
        gray = cv2.warpAffine(gray, M, (w2, h2), flags=cv2.INTER_CUBIC,
                              borderMode=cv2.BORDER_REPLICATE)
        # Nearest-neighbour keeps the mask strictly binary; cubic resampling
        # would introduce intermediate grey values along stroke edges.
        th = cv2.warpAffine(th, M, (w2, h2), flags=cv2.INTER_NEAREST,
                            borderMode=cv2.BORDER_REPLICATE)
    return gray, th


# Find text boxes using MSER or connected components (we'll use morphological dilation + contours)

def detect_text_regions(thresh_img, min_area=200, debug_out=None):
    """Locate candidate text-line boxes in a binary (ink = non-zero) image.

    Strokes are dilated with a wide rectangular kernel so neighbouring
    characters merge, and the bounding rectangle of each external contour
    becomes a candidate box.

    Args:
        thresh_img: Single-channel binary image.
        min_area: Boxes whose ``w * h`` is below this value are discarded.
        debug_out: Optional path; when not ``None`` an image with the kept
            boxes drawn in green is written there.

    Returns:
        List of ``(x, y, w, h)`` tuples ordered by ``y`` and then ``x``.
    """
    # A wide, short kernel joins letters horizontally into words/lines.
    joiner = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
    merged = cv2.dilate(thresh_img, joiner, iterations=2)
    contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    rects = (cv2.boundingRect(cnt) for cnt in contours)
    # Keep boxes that are large enough overall and not too short or narrow,
    # then order them top-to-bottom, left-to-right.
    boxes = sorted(
        ((x, y, w, h) for (x, y, w, h) in rects
         if w * h >= min_area and h >= 10 and w >= 20),
        key=lambda b: (b[1], b[0]),
    )

    if debug_out is not None:
        canvas = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
        for (bx, by, bw, bh) in boxes:
            cv2.rectangle(canvas, (bx, by), (bx + bw, by + bh), (0, 255, 0), 2)
        cv2.imwrite(debug_out, canvas)
    return boxes


# OCR each region (use appropriate psm)

def ocr_regions(gray_img, boxes, ocr_lang='eng'):
    """Run Tesseract on each text box and collect the non-empty results.

    Args:
        gray_img: Grayscale image array the boxes refer to.
        boxes: Iterable of ``(x, y, w, h)`` rectangles.
        ocr_lang: Tesseract language code passed as ``lang``.

    Returns:
        List of dicts with keys ``'box'`` (the padded, image-clamped
        ``(x0, y0, x1, y1)`` crop rectangle) and ``'text'`` (stripped OCR
        output). Boxes whose OCR text is empty are omitted.
    """
    pad = 4
    # --psm 7: treat each crop as a single text line.
    config = "--psm 7"
    results = []
    for (x, y, w, h) in boxes:
        # Grow the box by the padding, clamped to the image bounds.
        left = max(0, x - pad)
        top = max(0, y - pad)
        right = min(gray_img.shape[1], x + w + pad)
        bottom = min(gray_img.shape[0], y + h + pad)
        region = Image.fromarray(gray_img[top:bottom, left:right])
        text = pytesseract.image_to_string(region, lang=ocr_lang, config=config).strip()
        if not text:
            continue
        results.append({'box': (left, top, right, bottom), 'text': text})
    return results


# Heuristic to check if a line looks like an equation/expression

def looks_like_equation(s):
    """Heuristically decide whether an OCR'd line is a math expression.

    A line qualifies when any of the following holds:
      * it contains an operator or an equals sign, including the Unicode
        variants (multiplication/division signs, minus sign, en/em dash)
        that OCR commonly emits and that ``clean_ocr_for_sympy`` normalises;
      * it contains a standalone function-like word
        (sin, cos, tan, log, ln, lim, sqrt), case-insensitively;
      * a digit run is directly adjacent to an ASCII letter (``2x``, ``x2``).

    Args:
        s: Text of one OCR line.

    Returns:
        ``True`` if the line looks like an equation/expression, else ``False``.
    """
    # Local import: the file does not import ``re`` at module level.
    import re

    compact = s.replace(' ', '')
    # ASCII operators plus U+00D7 (times), U+00F7 (division), U+2212 (minus),
    # U+2013 (en dash) and U+2014 (em dash).
    if re.search(r'[=+\-*/^\u00d7\u00f7\u2212\u2013\u2014]', compact):
        return True
    # Something like 'lim', 'sin', 'cos' etc. as a whole word.
    if re.search(r'\b(sin|cos|tan|log|ln|lim|sqrt)\b', s.lower()):
        return True
    # Digits next to letters (like 2x or x2).
    if re.search(r'\d+[a-zA-Z]|[a-zA-Z]\d+', s):
        return True
    return False


# Clean OCR text for sympy: replace common OCR artifacts

def clean_ocr_for_sympy(s):
    """Normalise OCR text into something closer to SymPy-parseable syntax.

    Typographic operators and quotes are mapped to ASCII, ``X`` becomes
    ``x``, ``O`` becomes ``0`` (a blunt heuristic that also rewrites genuine
    capital O's), and ``^`` becomes ``**``. Non-printable characters are then
    dropped and surrounding whitespace is stripped.

    Args:
        s: Raw OCR text.

    Returns:
        The cleaned string.
    """
    # Single-pass character mapping; no replacement output is itself a key,
    # so this matches applying the substitutions one after another.
    substitutions = str.maketrans({
        '×': '*',
        'X': 'x',
        '—': '-',
        '−': '-',
        '–': '-',
        '÷': '/',
        '’': "'",
        '‘': "'",
        '“': '"',
        '”': '"',
        'O': '0',  # risky: only correct when the glyph really is a zero
        '^': '**',
    })
    mapped = s.translate(substitutions)
    # Strip control / non-printable characters (this includes tabs and newlines).
    printable_only = ''.join(filter(str.isprintable, mapped))
    return printable_only.strip()


# Try to parse with sympy and produce latex

def parse_equation_to_latex(s):
    """Parse OCR text with SymPy and render it as LaTeX.

    The text is first passed through ``clean_ocr_for_sympy``. If it contains
    ``=``, it is split on the first ``=`` and both sides are parsed
    separately; otherwise the whole string is parsed as one expression.

    Args:
        s: Raw OCR text of a candidate equation.

    Returns:
        ``(latex_str, None)`` on success, or ``(None, error_message)`` when
        parsing or rendering fails.
    """
    s_clean = clean_ocr_for_sympy(s)
    # SECURITY NOTE: sympify() evaluates its string argument with eval().
    # OCR output comes from an arbitrary image, so only run this on images
    # from a trusted source (or sanitise/whitelist the text beforehand).
    try:
        if '=' in s_clean:
            # Split on the first '=' and render each side on its own. No
            # "left - right" expression is built: it was never used, and the
            # subtraction could fail for operands that render fine.
            left, right = s_clean.split('=', 1)
            eleft = sympify(left)
            eright = sympify(right)
            latex_str = latex(eleft) + " = " + latex(eright)
        else:
            latex_str = latex(sympify(s_clean))
        return latex_str, None
    except SympifyError as e:
        return None, f"SympifyError: {e}"
    except Exception as e:
        # Best-effort: any other parser/printer failure is reported as text
        # so one bad line does not abort the whole pipeline.
        return None, str(e)


# Detect simple geometric primitives (lines via Hough, circles via HoughCircles)

def detect_shapes(gray_img, debug_prefix=None):
    """Detect straight line segments and circles in a grayscale image.

    Segments come from a probabilistic Hough transform over Canny edges;
    circles come from ``cv2.HoughCircles`` on the grayscale image itself.

    Args:
        gray_img: Single-channel grayscale image.
        debug_prefix: Optional path prefix; when truthy, an annotated image
            is written to ``f"{debug_prefix}_shapes.png"``.

    Returns:
        Tuple ``(line_list, circles)``. ``line_list`` is a list of
        ``(x1, y1, x2, y2)`` int tuples (possibly empty). ``circles`` is a
        list of ``(x, y, r)`` int tuples, or ``None`` when no circle was
        found or circle detection raised.
    """
    edge_map = cv2.Canny(gray_img, 50, 150, apertureSize=3)

    # Line segments.
    segments = cv2.HoughLinesP(edge_map, 1, np.pi/180, threshold=80, minLineLength=50, maxLineGap=10)
    if segments is None:
        line_list = []
    else:
        line_list = [
            (int(xa), int(ya), int(xb), int(yb))
            for (xa, ya, xb, yb) in (seg[0] for seg in segments)
        ]

    # Circles; any failure here is treated as "no circles".
    circles = None
    try:
        found = cv2.HoughCircles(gray_img, cv2.HOUGH_GRADIENT, dp=1.2, minDist=30,
                                 param1=100, param2=30, minRadius=8, maxRadius=200)
        if found is not None:
            rounded = np.round(found[0, :]).astype("int")
            circles = [(int(cx), int(cy), int(cr)) for (cx, cy, cr) in rounded]
    except Exception:
        circles = None

    # Optional debug rendering: lines in green, circles in red.
    if debug_prefix:
        canvas = cv2.cvtColor(gray_img, cv2.COLOR_GRAY2BGR)
        for (xa, ya, xb, yb) in line_list:
            cv2.line(canvas, (xa, ya), (xb, yb), (0, 255, 0), 2)
        for (cx, cy, cr) in circles or []:
            cv2.circle(canvas, (cx, cy), cr, (0, 0, 255), 2)
        cv2.imwrite(f"{debug_prefix}_shapes.png", canvas)

    return line_list, circles


# Render vector-style diagram to SVG using svgwrite or Matplotlib

def render_vector_diagram(svg_path, image_size, lines, circles, boxes=None):
    """Write detected primitives to an SVG file.

    Args:
        svg_path: Destination path of the SVG file.
        image_size: ``(width, height)`` of the drawing.
        lines: Iterable of ``(x1, y1, x2, y2)`` segments.
        circles: Iterable of ``(x, y, r)`` circles, or a falsy value for none.
        boxes: Optional iterable of ``(x0, y0, x1, y1)`` rectangles drawn as
            thin light-gray outlines.
    """
    width, height = image_size
    drawing = svgwrite.Drawing(svg_path, size=(width, height))

    # White background covering the whole canvas.
    drawing.add(drawing.rect(insert=(0, 0), size=(width, height), fill='white'))

    # Text-region outlines (optional).
    for (bx0, by0, bx1, by1) in boxes or ():
        outline = drawing.rect(insert=(bx0, by0), size=(bx1 - bx0, by1 - by0),
                               fill='none', stroke='lightgray', stroke_width=1)
        drawing.add(outline)

    # Line segments share one dark stroke colour.
    segment_stroke = svgwrite.rgb(10, 10, 16, '%')
    for (xa, ya, xb, yb) in lines:
        drawing.add(drawing.line(start=(xa, ya), end=(xb, yb),
                                 stroke=segment_stroke, stroke_width=2))

    # Circles.
    for (cx, cy, radius) in circles or ():
        drawing.add(drawing.circle(center=(cx, cy), r=radius, stroke='black',
                                   fill='none', stroke_width=2))

    drawing.save()


# Main pipeline

def process_whiteboard_image(in_path, out_dir="wb_outputs"):
    """Run the whole digitizing pipeline on one whiteboard photo.

    Steps: preprocess, detect text regions, OCR them, convert equation-like
    lines to LaTeX, detect lines/circles, and write all artefacts to
    ``out_dir`` (``ocr_texts.txt``, ``equations_latex.tex``, ``diagram.svg``
    and several debug PNGs).

    Args:
        in_path: Path of the input image.
        out_dir: Directory that receives every output file; created if absent.

    Returns:
        Dict with keys ``"ocr_lines"`` (list of OCR result dicts),
        ``"equations"`` (list of ``(ocr_text, latex_or_message)`` tuples,
        where failures carry an ``"UNPARSEABLE: ..."`` message), ``"svg"``
        (path of the SVG) and ``"debug"`` (the output directory).

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for ``in_path``.
    """
    ensure_dir(out_dir)
    img_bgr = cv2.imread(in_path)
    if img_bgr is None:
        raise FileNotFoundError(in_path)
    gray, th = preprocess_image(img_bgr)
    debug_pre = os.path.join(out_dir, "debug_preprocess.png")
    cv2.imwrite(debug_pre, gray)
    cv2.imwrite(os.path.join(out_dir, "debug_thresh.png"), th)

    # Detect regions (text lines) and OCR them.
    boxes = detect_text_regions(th, debug_out=os.path.join(out_dir, "debug_boxes.png"))
    ocr_lines = ocr_regions(gray, boxes)

    # Write raw OCR results, one line per region.
    ocr_txt_file = os.path.join(out_dir, "ocr_texts.txt")
    with open(ocr_txt_file, "w", encoding='utf-8') as f:
        f.writelines(item['text'] + "\n" for item in ocr_lines)
    print(f"[+] OCR lines saved to {ocr_txt_file}")

    # Convert likely equations to LaTeX.
    eq_candidates = [it for it in ocr_lines if looks_like_equation(it['text'])]
    latex_results = []
    tex_entries = []
    for it in eq_candidates:
        txt = it['text']
        latex_str, err = parse_equation_to_latex(txt)
        if latex_str:
            out = latex_str
            tex_body = latex_str
        else:
            out = f"UNPARSEABLE: {err}"
            # Error text is not LaTeX and may span several lines: emit every
            # line as a LaTeX comment so the .tex file stays valid.
            tex_body = "\n".join("% " + ln for ln in out.splitlines())
        latex_results.append((txt, out))
        tex_entries.append("% OCR: " + txt.replace("\n", " ") + "\n" + tex_body + "\n\n")

    # Write the LaTeX file.
    eq_file = os.path.join(out_dir, "equations_latex.tex")
    with open(eq_file, "w", encoding='utf-8') as f:
        f.write("% Generated LaTeX (auto) — review and correct as needed\n")
        f.writelines(tex_entries)
    print(f"[+] Equation LaTeX saved to {eq_file}")

    # Shape detection and SVG rendering.
    lines, circles = detect_shapes(gray, debug_prefix=os.path.join(out_dir, "debug"))
    svg_path = os.path.join(out_dir, "diagram.svg")
    # Convert (x, y, w, h) boxes to corner form (x0, y0, x1, y1).
    bboxes = [(x, y, x + w, y + h) for (x, y, w, h) in boxes]
    render_vector_diagram(svg_path, (gray.shape[1], gray.shape[0]), lines, circles, boxes=bboxes)
    print(f"[+] Diagram SVG saved to {svg_path}")

    # Raster overlay (drawn with OpenCV): lines green, circles red, boxes blue.
    overlay = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
    for (x1, y1, x2, y2) in lines:
        cv2.line(overlay, (x1, y1), (x2, y2), (0, 255, 0), 2)
    if circles:
        for (x, y, r) in circles:
            cv2.circle(overlay, (x, y), r, (0, 0, 255), 2)
    for (x0, y0, x1, y1) in bboxes:
        cv2.rectangle(overlay, (x0, y0), (x1, y1), (255, 0, 0), 1)
    cv2.imwrite(os.path.join(out_dir, "overlay_debug.png"), overlay)
    print(f"[+] Debug overlay saved to {os.path.join(out_dir, 'overlay_debug.png')}")

    return {
        "ocr_lines": ocr_lines,
        "equations": latex_results,
        "svg": svg_path,
        "debug": out_dir
    }


# --------- CLI ----------

if __name__ == "__main__":
    # The first positional argument is the input image path; any further
    # arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python whiteboard_digitizer.py input_image.jpg")
        sys.exit(1)
    out = "wb_outputs"
    inp = cli_args[0]
    res = process_whiteboard_image(inp, out_dir=out)
    print("Done. Outputs in:", out)


# No comments.