"""
sheet_to_midi.py
Simple prototype: Convert a scanned single-line, monophonic staff in TREBLE CLEF
to a MIDI file using OpenCV -> heuristic notehead detection -> music21.
Limitations:
- Monophonic, printed notation, single staff detection.
- Treats each notehead as a quarter note by default.
- No clef/key/time signature detection (assumes treble clef, 4/4).
- Not a replacement for full OMR systems like Audiveris.
Usage:
python sheet_to_midi.py input_image.png output.mid
"""
import sys
import cv2
import numpy as np
import math
from music21 import stream, note, midi, tempo, meter
from PIL import Image
# -------------------------
# Utility & image helpers
# -------------------------
def load_image(path):
    """Read *path* as a grayscale image.

    Raises FileNotFoundError when the file is missing or unreadable
    (cv2.imread signals failure by returning None rather than raising).
    """
    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise FileNotFoundError(f"Cannot open image: {path}")
    return image
def binarize(img):
    """Adaptively threshold *img* so ink becomes white (255) on black.

    Mean-based adaptive thresholding is used because it tolerates uneven
    scan lighting better than a single global threshold.
    """
    return cv2.adaptiveThreshold(
        img,
        255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        15,
        10,
    )
# -------------------------
# Staff line detection
# -------------------------
def detect_staff_lines(binary_img, debug=False):
    """Locate staff-line y-positions in a binarized score image.

    Pipeline: a wide horizontal morphological opening isolates the staff
    lines, a row-wise projection turns them into peaks, contiguous peak
    rows are merged into single line centres, and the group of five
    consecutive lines with the most even spacing is returned.

    Returns a list of y-coordinates: the best 5-line group when at least
    five lines were found, otherwise whatever lines were detected (the
    caller is expected to cope with the fallback).
    """
    height, width = binary_img.shape

    # Emphasise long horizontal runs (staff lines) and suppress everything else.
    kernel_width = max(10, width // 30)
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))
    opened = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, line_kernel)

    # Row projection, normalised to [0, 1]; epsilon avoids divide-by-zero
    # on a blank image.
    profile = np.sum(opened, axis=1)
    profile = (profile - profile.min()) / (profile.max() - profile.min() + 1e-9)

    peak_rows = np.where(profile > 0.15)[0]  # 0.15 is a tunable threshold
    if len(peak_rows) == 0:
        return []

    # Merge runs of near-adjacent rows (gap <= 2 px) into one centre each.
    runs = [[peak_rows[0]]]
    for row in peak_rows[1:]:
        if row - runs[-1][-1] <= 2:
            runs[-1].append(row)
        else:
            runs.append([row])
    lines = [int(np.mean(run)) for run in runs]

    if len(lines) < 5:
        return lines  # fallback: not enough lines for a full staff

    # Staves are five equally spaced lines: of all windows of five
    # consecutive detected lines, keep the one whose spacings vary least.
    windows = (lines[i:i + 5] for i in range(len(lines) - 4))
    return min(windows, key=lambda group: np.var(np.diff(group)))
# -------------------------
# Note head detection
# -------------------------
def detect_noteheads(binary_img, staff_lines, debug=False):
    """Find connected components that plausibly correspond to noteheads.

    The staff lines are rebuilt with the same horizontal opening used for
    detection and erased first, so a line crossing a head does not split
    it into two blobs; small gaps are then closed and external contours
    filtered by simple size heuristics.

    Returns bounding boxes (x, y, w, h) sorted left-to-right.
    """
    height, width = binary_img.shape

    # Erase the staff lines: build a line mask and AND it out.
    kernel_width = max(10, width // 30)
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))
    line_mask = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, line_kernel)
    cleaned = cv2.bitwise_and(binary_img.copy(), cv2.bitwise_not(line_mask))

    # Close the thin gaps left where lines crossed the heads.
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_CLOSE, ellipse, iterations=1)

    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def plausible(contour, box):
        # Size heuristics for a printed notehead.
        x, y, bw, bh = box
        if cv2.contourArea(contour) < 30:   # speckle noise
            return False
        if bh > 3 * bw and bh > 40:         # tall & thin: stem or flag
            return False
        if bw > width * 0.6:                # very wide: text / other marks
            return False
        return True

    candidates = [(c, cv2.boundingRect(c)) for c in contours]
    boxes = [box for contour, box in candidates if plausible(contour, box)]
    return sorted(boxes, key=lambda b: b[0])
# -------------------------
# Map vertical position to pitch (treble clef)
# -------------------------
def map_y_to_pitch(y_center, staff_lines):
    """Map a vertical pixel coordinate to a pitch name, assuming treble clef.

    Parameters
    ----------
    y_center : float
        Vertical (y) coordinate of a notehead centre.
    staff_lines : sequence of int
        y-positions of detected staff lines (any order). At least two are
        required so the line spacing can be estimated.

    Returns
    -------
    str
        Pitch name with octave, e.g. "F5" (top line), "E4" (bottom line),
        "C4" (middle C, first ledger line below the staff).

    Raises
    ------
    ValueError
        If fewer than two staff lines are supplied (spacing undefined —
        the old code silently produced NaN spacing here).

    Notes
    -----
    Adjacent staff positions (line -> space -> line) are *diatonic* steps,
    not semitones. The previous implementation mapped each half-position
    to one semitone, which drifted chromatically away from the correct
    pitches; this version walks the diatonic scale (natural letters, no
    accidentals), anchored at the treble-clef top line F5. Ledger lines
    any distance above or below the staff are handled naturally.
    """
    lines = sorted(staff_lines)
    if len(lines) < 2:
        raise ValueError("need at least two staff lines to estimate spacing")

    # Half the staff spacing = one position step (line -> space).
    half = float(np.median(np.diff(lines))) / 2.0

    # Position index relative to the top line: 0 = top line, positive = down.
    pos = int(round((y_center - lines[0]) / half))

    # Diatonic numbering: octave * 7 + letter index within C D E F G A B.
    letters = "CDEFGAB"
    f5 = 5 * 7 + letters.index("F")      # diatonic number of F5 (top line)
    diatonic = f5 - pos                  # one diatonic step per half-spacing
    octave, letter_idx = divmod(diatonic, 7)
    return f"{letters[letter_idx]}{octave}"
# -------------------------
# Build music21 stream from detected notes
# -------------------------
def build_stream_from_boxes(boxes, staff_lines, tempo_bpm=100):
    """Turn notehead bounding boxes into a music21 stream of quarter notes.

    Boxes are assumed to be ordered left-to-right; each becomes one
    quarter note whose pitch comes from the vertical centre of its box.
    Durations, chords and rests are not inferred (prototype limitation).
    """
    result = stream.Stream()
    result.append(tempo.MetronomeMark(number=tempo_bpm))
    result.append(meter.TimeSignature('4/4'))  # fixed 4/4 assumption

    for x, y, box_w, box_h in boxes:
        center_y = y + box_h / 2.0
        quarter = note.Note(map_y_to_pitch(center_y, staff_lines))
        quarter.duration.quarterLength = 1.0  # every note defaults to a quarter
        result.append(quarter)

    return result
# -------------------------
# Main flow
# -------------------------
def process_image_to_midi(input_path, output_midi_path, debug=False):
    """Run the full pipeline: image file -> staff & notehead detection -> MIDI.

    Returns True on success, False when no noteheads could be found.
    Raises FileNotFoundError if the input image cannot be read.
    """
    binary = binarize(load_image(input_path))

    staff_lines = detect_staff_lines(binary, debug=debug)
    if staff_lines and len(staff_lines) >= 5:
        print("Detected staff lines (y-coordinates):", staff_lines)
    else:
        print("Warning: could not detect 5 staff lines reliably. Trying to proceed with available lines.")

    boxes = detect_noteheads(binary, staff_lines, debug=debug)
    if not boxes:
        print("No noteheads detected. Exiting.")
        return False

    print(f"Detected {len(boxes)} candidate noteheads (left→right).")
    for i, b in enumerate(boxes, start=1):
        x, y, wbox, hbox = b
        print(f"{i}: x={x}, y={y}, w={wbox}, h={hbox}")

    music_stream = build_stream_from_boxes(boxes, staff_lines, tempo_bpm=100)

    # Serialise the music21 stream to a standard MIDI file on disk.
    mf = midi.translate.streamToMidiFile(music_stream)
    mf.open(output_midi_path, 'wb')
    mf.write()
    mf.close()
    print(f"MIDI saved to {output_midi_path}")
    return True
# -------------------------
# CLI
# -------------------------
if __name__ == "__main__":
    # Two positional arguments required: input image and output MIDI path.
    if len(sys.argv) < 3:
        print("Usage: python sheet_to_midi.py input_image.png output.mid")
        sys.exit(1)
    succeeded = process_image_to_midi(sys.argv[1], sys.argv[2], debug=True)
    if not succeeded:
        sys.exit(2)