#!/usr/bin/env python3
"""
AI Whiteboard Digitizer (prototype)
Usage:
python whiteboard_digitizer.py input_image.jpg
Outputs (written to ./wb_outputs/):
- ocr_texts.txt : OCR'd text lines (raw)
- equations_latex.tex : LaTeX for parseable math expressions
- diagram.svg : vector-like rendering of detected lines/circles
- several debug images (debug_*.png and overlay_debug.png)
"""
import sys
import os
import cv2
import numpy as np
from PIL import Image
import pytesseract
from sympy import sympify, latex
from sympy.core.sympify import SympifyError
import svgwrite
import matplotlib.pyplot as plt
# If on Windows and tesseract is not in PATH, set path here (uncomment and adjust)
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# ---------- Utilities ----------
def ensure_dir(path):
    """Create directory *path* (including missing parents) if it is absent.

    ``exist_ok=True`` replaces the former "check, then create" sequence, so
    the call cannot fail when the directory appears between the check and
    the creation.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
# Preprocess: grayscale, denoise, adaptive threshold, deskew
def preprocess_image(img_bgr, max_dim=1600):
    """Prepare a whiteboard photo for OCR and shape detection.

    Steps: downscale so the longer side is at most *max_dim*, convert to
    grayscale, denoise, adaptive-threshold (inverted, so dark ink becomes
    255) and deskew using the minimum-area rectangle around the ink pixels.

    Args:
        img_bgr: BGR image array (the caller passes the ``cv2.imread`` result).
        max_dim: Maximum length, in pixels, of the longer image side.

    Returns:
        Tuple ``(gray, th)``: the denoised grayscale image and the inverted
        binary mask, both rotated by the same deskew angle.
    """
    # Resize to a manageable size, keeping the aspect ratio. Never upscale.
    h, w = img_bgr.shape[:2]
    scale = min(1.0, float(max_dim) / max(h, w))
    if scale != 1.0:
        # Clamp to 1 px so an extreme aspect ratio cannot truncate a side to 0.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img_bgr = cv2.resize(img_bgr, new_size, interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # Denoise, then bilateral-filter to smooth while preserving edges.
    gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    gray = cv2.bilateralFilter(gray, 9, 75, 75)
    # Adaptive threshold (whiteboard: dark text on light background).
    th = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY_INV, 25, 12)
    # Deskew from the minimum-area rectangle enclosing all ink pixels.
    coords = np.column_stack(np.where(th > 0))
    if coords.shape[0] > 0:
        angle = cv2.minAreaRect(coords)[-1]
        # minAreaRect's angle range depends on the OpenCV release: older
        # versions report [-90, 0), newer ones (4.5.1 onward) report (0, 90].
        # Folding into [-45, 45] first makes the correction valid for both;
        # the original test only handled the legacy range and could rotate
        # the image by up to 90 degrees on newer releases.
        if angle < -45:
            angle += 90
        elif angle > 45:
            angle -= 90
        angle = -angle
        if angle != 0:
            (h2, w2) = gray.shape[:2]
            M = cv2.getRotationMatrix2D((w2 // 2, h2 // 2), angle, 1.0)
            gray = cv2.warpAffine(gray, M, (w2, h2), flags=cv2.INTER_CUBIC,
                                  borderMode=cv2.BORDER_REPLICATE)
            # Nearest-neighbour keeps the mask strictly 0/255; cubic
            # interpolation would introduce intermediate grey values.
            th = cv2.warpAffine(th, M, (w2, h2), flags=cv2.INTER_NEAREST,
                                borderMode=cv2.BORDER_REPLICATE)
    return gray, th
# Find text boxes using MSER or connected components (we'll use morphological dilation + contours)
def detect_text_regions(thresh_img, min_area=200, debug_out=None):
    """Find word/line-sized blobs in a binary ink mask.

    The mask is dilated with a wide rectangular kernel so neighbouring
    letters merge, then the bounding rectangle of each external contour is
    kept if it is large enough.

    Args:
        thresh_img: Single-channel mask in which ink pixels are non-zero.
        min_area: Minimum bounding-box area (``w * h``) for a box to be kept.
        debug_out: Optional path; when given, an image of the mask with the
            kept boxes drawn in green is written there.

    Returns:
        List of ``(x, y, w, h)`` tuples sorted by ``y`` and then ``x``.
    """
    # A kernel wider than it is tall joins characters along a text line.
    join_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
    merged = cv2.dilate(thresh_img, join_kernel, iterations=2)
    contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    rects = (cv2.boundingRect(contour) for contour in contours)
    # Drop boxes that are too small overall, too short (< 10 px) or too
    # narrow (< 20 px) to be text.
    boxes = [
        (x, y, w, h)
        for (x, y, w, h) in rects
        if w * h >= min_area and h >= 10 and w >= 20
    ]
    # Top-to-bottom, then left-to-right.
    boxes.sort(key=lambda box: (box[1], box[0]))

    if debug_out is not None:
        canvas = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
        for (x, y, w, h) in boxes:
            cv2.rectangle(canvas, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.imwrite(debug_out, canvas)
    return boxes
# OCR each region (use appropriate psm)
def ocr_regions(gray_img, boxes, ocr_lang='eng'):
    """Run Tesseract on each detected region of the grayscale image.

    Args:
        gray_img: Grayscale image array to crop regions from.
        boxes: Iterable of ``(x, y, w, h)`` region rectangles.
        ocr_lang: Tesseract language code passed as ``lang``.

    Returns:
        List of dicts ``{'box': (x0, y0, x1, y1), 'text': str}``, one per
        region that produced non-empty text. The box is the padded crop
        rectangle, clipped to the image bounds.
    """
    pad = 4
    # --psm 7: treat the crop as a single text line.
    config = "--psm 7"
    img_h, img_w = gray_img.shape[:2]

    results = []
    for (x, y, w, h) in boxes:
        left = max(0, x - pad)
        top = max(0, y - pad)
        right = min(img_w, x + w + pad)
        bottom = min(img_h, y + h + pad)
        # pytesseract is handed a PIL image of the crop.
        region = Image.fromarray(gray_img[top:bottom, left:right])
        text = pytesseract.image_to_string(region, lang=ocr_lang, config=config).strip()
        if not text:
            continue
        results.append({'box': (left, top, right, bottom), 'text': text})
    return results
# Heuristic to check if a line looks like an equation/expression
def looks_like_equation(s):
    """Heuristically decide whether an OCR'd line is a math expression.

    The line qualifies when any of these holds:
      * it contains one of the characters ``= + - * / ^``;
      * it contains a whole-word function name (sin, cos, tan, log, ln, lim,
        sqrt), case-insensitively;
      * a digit is directly adjacent to an ASCII letter (``2x`` or ``x2``).

    Args:
        s: Text of one OCR line.

    Returns:
        True if the line looks like an equation/expression, else False.
    """
    import re

    compact = s.replace(' ', '')
    # Each entry pairs a pattern with the form of the text it is searched in.
    checks = (
        (r'[=\+\-\*/\^]', compact),
        (r'\b(sin|cos|tan|log|ln|lim|sqrt)\b', s.lower()),
        (r'\d+[a-zA-Z]|[a-zA-Z]\d+', s),
    )
    return any(re.search(pattern, text) for pattern, text in checks)
# Clean OCR text for sympy: replace common OCR artifacts
def clean_ocr_for_sympy(s):
    """Normalise OCR output so it has a better chance of parsing in SymPy.

    * Maps typographic operators, dashes and quotes to ASCII, upper-case
      ``X`` to ``x``, and ``^`` to ``**``.
    * Turns a run of capital ``O`` into zeros only when the run is not
      adjacent to an ASCII letter (``1O`` -> ``10``, ``y = O`` -> ``y = 0``),
      so letters inside words such as ``LOG`` are left alone.
    * Drops non-printable characters and trims surrounding whitespace.

    Args:
        s: Raw OCR text.

    Returns:
        The cleaned string.
    """
    # Function-scope import: `re` is not imported at module level in this file.
    import re

    # All keys are single characters and no replacement is itself a key, so a
    # single translate() pass is equivalent to chained str.replace calls.
    char_map = str.maketrans({
        '×': '*',
        'X': 'x',
        '—': '-',
        '−': '-',
        '–': '-',
        '÷': '/',
        '’': "'",
        '‘': "'",
        '“': '"',
        '”': '"',
        '^': '**',
    })
    out = s.translate(char_map)
    # 'O' -> '0' is risky, so only do it where the O cannot be part of a word.
    out = re.sub(r'(?<![A-Za-z])O+(?![A-Za-z])',
                 lambda m: '0' * len(m.group()), out)
    # Strip control / non-printable characters only; keep everything else
    # (e.g. Greek letters) for the parser to accept or reject.
    out = ''.join(ch for ch in out if ch.isprintable())
    return out.strip()
# Try to parse with sympy and produce latex
def parse_equation_to_latex(s):
    """Parse an OCR'd expression or equation with SymPy and render LaTeX.

    The text is first passed through ``clean_ocr_for_sympy``. If it contains
    ``=``, it is split on the first ``=`` and each side is parsed
    separately; otherwise the whole string is parsed as one expression.

    Args:
        s: Raw OCR text of one line.

    Returns:
        ``(latex_str, None)`` on success, or ``(None, error_message)`` when
        parsing fails.
    """
    s_clean = clean_ocr_for_sympy(s)
    # NOTE(security): sympify() evaluates its string argument with eval().
    # OCR text from an arbitrary image is untrusted input; do not run this
    # on images from untrusted sources without sandboxing.
    try:
        if '=' in s_clean:
            left, right = s_clean.split('=', 1)
            # The previous `eleft - eright` was computed and never used, yet
            # it could raise (e.g. for non-arithmetic results) and reject an
            # equation whose two sides had parsed successfully.
            latex_str = latex(sympify(left)) + " = " + latex(sympify(right))
        else:
            latex_str = latex(sympify(s_clean))
    except SympifyError as e:
        return None, f"SympifyError: {e}"
    except Exception as e:
        # Deliberately broad: parsing arbitrary OCR text can fail in many
        # ways, and the caller records the message instead of aborting.
        return None, str(e)
    return latex_str, None
# Detect simple geometric primitives (lines via Hough, circles via HoughCircles)
def detect_shapes(gray_img, debug_prefix=None):
    """Detect straight line segments and circles in a grayscale image.

    Line segments come from a probabilistic Hough transform on Canny edges;
    circles come from ``cv2.HoughCircles`` on the grayscale image itself.

    Args:
        gray_img: Single-channel grayscale image.
        debug_prefix: Optional path prefix; when truthy, an annotated image
            is written to ``f"{debug_prefix}_shapes.png"`` (lines in green,
            circles in red).

    Returns:
        Tuple ``(line_list, circles)``. ``line_list`` is a list of
        ``(x1, y1, x2, y2)`` int tuples (possibly empty). ``circles`` is a
        list of ``(x, y, r)`` int tuples, or ``None`` when no circle was
        found or circle detection failed.
    """
    edges = cv2.Canny(gray_img, 50, 150, apertureSize=3)
    segments = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=80,
                               minLineLength=50, maxLineGap=10)
    line_list = []
    if segments is not None:
        for seg in segments:
            x1, y1, x2, y2 = seg[0]
            line_list.append((int(x1), int(y1), int(x2), int(y2)))

    circles = None
    try:
        circ = cv2.HoughCircles(gray_img, cv2.HOUGH_GRADIENT, dp=1.2, minDist=30,
                                param1=100, param2=30, minRadius=8, maxRadius=200)
    except cv2.error as exc:
        # Circle detection stays best-effort, but the failure is reported
        # instead of being silently discarded.
        print(f"[!] Circle detection failed: {exc}", file=sys.stderr)
        circ = None
    if circ is not None:
        circ = np.round(circ[0, :]).astype("int")
        circles = [(int(x), int(y), int(r)) for (x, y, r) in circ]

    if debug_prefix:
        vis = cv2.cvtColor(gray_img, cv2.COLOR_GRAY2BGR)
        for (x1, y1, x2, y2) in line_list:
            cv2.line(vis, (x1, y1), (x2, y2), (0, 255, 0), 2)
        if circles:
            for (x, y, r) in circles:
                cv2.circle(vis, (x, y), r, (0, 0, 255), 2)
        cv2.imwrite(f"{debug_prefix}_shapes.png", vis)
    return line_list, circles
# Render vector-style diagram to SVG using svgwrite or Matplotlib
def render_vector_diagram(svg_path, image_size, lines, circles, boxes=None):
    """Write the detected primitives to an SVG file.

    Args:
        svg_path: Destination path of the SVG document.
        image_size: ``(width, height)`` of the drawing.
        lines: Iterable of ``(x1, y1, x2, y2)`` line segments.
        circles: Iterable of ``(x, y, r)`` circles, or ``None``/empty.
        boxes: Optional iterable of ``(x0, y0, x1, y1)`` text-region
            rectangles, drawn as thin light-gray outlines.
    """
    width, height = image_size
    drawing = svgwrite.Drawing(svg_path, size=(width, height))

    # White background covering the whole canvas.
    drawing.add(drawing.rect(insert=(0, 0), size=(width, height), fill='white'))

    for (left, top, right, bottom) in boxes or ():
        outline = drawing.rect(insert=(left, top), size=(right - left, bottom - top),
                               fill='none', stroke='lightgray', stroke_width=1)
        drawing.add(outline)

    line_colour = svgwrite.rgb(10, 10, 16, '%')
    for (x1, y1, x2, y2) in lines:
        segment = drawing.line(start=(x1, y1), end=(x2, y2),
                               stroke=line_colour, stroke_width=2)
        drawing.add(segment)

    for (cx, cy, radius) in circles or ():
        ring = drawing.circle(center=(cx, cy), r=radius, stroke='black',
                              fill='none', stroke_width=2)
        drawing.add(ring)

    drawing.save()
# Main pipeline
def process_whiteboard_image(in_path, out_dir="wb_outputs"):
    """Run the full digitizing pipeline on one image and write all outputs.

    Files written into *out_dir*: ``debug_preprocess.png``,
    ``debug_thresh.png``, ``debug_boxes.png``, ``debug_shapes.png``,
    ``ocr_texts.txt``, ``equations_latex.tex``, ``diagram.svg`` and
    ``overlay_debug.png``.

    Args:
        in_path: Path of the input image.
        out_dir: Directory for all outputs; created if missing.

    Returns:
        Dict with keys ``"ocr_lines"`` (list of OCR result dicts),
        ``"equations"`` (list of ``(ocr_text, latex_or_error)`` tuples),
        ``"svg"`` (path of the SVG file) and ``"debug"`` (*out_dir*).

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for *in_path*.
    """
    ensure_dir(out_dir)
    img_bgr = cv2.imread(in_path)
    if img_bgr is None:
        raise FileNotFoundError(in_path)
    gray, th = preprocess_image(img_bgr)
    cv2.imwrite(os.path.join(out_dir, "debug_preprocess.png"), gray)
    cv2.imwrite(os.path.join(out_dir, "debug_thresh.png"), th)

    # Detect text regions, then OCR each of them.
    boxes = detect_text_regions(th, debug_out=os.path.join(out_dir, "debug_boxes.png"))
    ocr_lines = ocr_regions(gray, boxes)
    ocr_txt_file = os.path.join(out_dir, "ocr_texts.txt")
    with open(ocr_txt_file, "w", encoding='utf-8') as f:
        for item in ocr_lines:
            f.write(item['text'] + "\n")
    print(f"[+] OCR lines saved to {ocr_txt_file}")

    # Convert likely equations to LaTeX.
    latex_results = []  # (ocr_text, latex_or_error) -- returned to the caller
    tex_entries = []    # (ocr_text, text written to the .tex file)
    for it in ocr_lines:
        txt = it['text']
        if not looks_like_equation(txt):
            continue
        latex_str, err = parse_equation_to_latex(txt)
        if latex_str:
            latex_results.append((txt, latex_str))
            tex_entries.append((txt, latex_str))
        else:
            message = f"UNPARSEABLE: {err}"
            latex_results.append((txt, message))
            # Error text is not LaTeX and may span several lines; write it
            # as comments so the .tex file contains only valid LaTeX.
            commented = "\n".join("% " + ln for ln in message.splitlines())
            tex_entries.append((txt, commented))

    eq_file = os.path.join(out_dir, "equations_latex.tex")
    with open(eq_file, "w", encoding='utf-8') as f:
        f.write("% Generated LaTeX (auto) — review and correct as needed\n")
        for orig, body in tex_entries:
            f.write("% OCR: " + orig.replace("\n", " ") + "\n")
            f.write(body + "\n\n")
    print(f"[+] Equation LaTeX saved to {eq_file}")

    # Shape detection and SVG rendering.
    lines, circles = detect_shapes(gray, debug_prefix=os.path.join(out_dir, "debug"))
    svg_path = os.path.join(out_dir, "diagram.svg")
    # The SVG renderer expects corner coordinates (x0, y0, x1, y1).
    bboxes = [(x, y, x + w, y + h) for (x, y, w, h) in boxes]
    render_vector_diagram(svg_path, (gray.shape[1], gray.shape[0]), lines, circles, boxes=bboxes)
    print(f"[+] Diagram SVG saved to {svg_path}")

    # Raster overlay (drawn with OpenCV) of everything that was detected.
    overlay = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
    for (x1, y1, x2, y2) in lines:
        cv2.line(overlay, (x1, y1), (x2, y2), (0, 255, 0), 2)
    if circles:
        for (x, y, r) in circles:
            cv2.circle(overlay, (x, y), r, (0, 0, 255), 2)
    for (x0, y0, x1, y1) in bboxes:
        cv2.rectangle(overlay, (x0, y0), (x1, y1), (255, 0, 0), 1)
    overlay_path = os.path.join(out_dir, "overlay_debug.png")
    cv2.imwrite(overlay_path, overlay)
    print(f"[+] Debug overlay saved to {overlay_path}")

    return {
        "ocr_lines": ocr_lines,
        "equations": latex_results,
        "svg": svg_path,
        "debug": out_dir,
    }
# --------- CLI ----------
if __name__ == "__main__":
    # Command-line entry point: the first argument is the input image path.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python whiteboard_digitizer.py input_image.jpg")
        sys.exit(1)
    output_dir = "wb_outputs"
    result = process_whiteboard_image(cli_args[0], out_dir=output_dir)
    print("Done. Outputs in:", output_dir)
# No comments:
# Post a Comment