# Music Sheet to Audio Converter

"""

sheet_to_midi.py


Simple prototype: Convert a scanned single-line, monophonic staff in TREBLE CLEF

to a MIDI file using OpenCV -> heuristic notehead detection -> music21.


Limitations:

 - Monophonic, printed notation, single staff detection.

 - Treats each notehead as a quarter note by default.

 - No clef/key/time signature detection (assumes treble clef, 4/4).

 - Not a replacement for full OMR systems like Audiveris.


Usage:

    python sheet_to_midi.py input_image.png output.mid

"""


import sys

import cv2

import numpy as np

import math

from music21 import stream, note, midi, tempo, meter

from PIL import Image


# -------------------------

# Utility & image helpers

# -------------------------

def load_image(path):
    """Read the image at *path* as grayscale; raise FileNotFoundError if unreadable."""
    grayscale = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if grayscale is None:
        raise FileNotFoundError(f"Cannot open image: {path}")
    return grayscale


def binarize(img):
    """Binarize *img* (foreground white) with an adaptive mean threshold, robust to lighting."""
    return cv2.adaptiveThreshold(
        img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 15, 10
    )


# -------------------------

# Staff line detection

# -------------------------

def detect_staff_lines(binary_img, debug=False):
    """
    Locate the five lines of a staff in a binarized image.

    Horizontal strokes are isolated with a wide morphological opening, row
    sums are projected and thresholded, nearby peak rows are merged into
    line centers, and the most evenly spaced window of five lines is
    returned.  Returns a list of y-positions; fewer than five entries (or
    an empty list) when detection is unreliable.
    """
    height, width = binary_img.shape

    # A wide, 1-px-tall opening kernel keeps only long horizontal runs.
    kernel_width = max(10, width // 30)
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))
    horizontal = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, line_kernel)

    # Row projection, normalized to [0, 1] (epsilon avoids divide-by-zero).
    projection = np.sum(horizontal, axis=1)
    projection = (projection - projection.min()) / (projection.max() - projection.min() + 1e-9)

    # Rows clearing the (tunable) threshold are staff-line candidates.
    peaks = np.where(projection > 0.15)[0]
    if len(peaks) == 0:
        return []

    # Merge runs of adjacent candidate rows (gap <= 2 px) into single
    # line centers, averaging each run.
    lines = []
    run = [peaks[0]]
    for row in peaks[1:]:
        if row - run[-1] <= 2:
            run.append(row)
        else:
            lines.append(int(np.mean(run)))
            run = [row]
    if run:
        lines.append(int(np.mean(run)))

    if len(lines) < 5:
        return lines  # fallback: caller proceeds with what was found

    # A staff is five equally spaced lines; among all windows of five
    # consecutive detected lines, pick the one whose spacings have the
    # lowest variance.
    best_group = None
    best_score = 1e9
    for start in range(len(lines) - 4):
        window = lines[start:start + 5]
        variance = np.var(np.diff(window))
        if variance < best_score:
            best_score = variance
            best_group = window

    return best_group if best_group is not None else lines[:5]


# -------------------------

# Note head detection

# -------------------------

def detect_noteheads(binary_img, staff_lines, debug=False):
    """
    Find connected components that plausibly are noteheads.

    The staff-line mask (re-derived with the same morphology used for
    detection) is erased first so lines don't split heads, small gaps are
    closed, then external contours are filtered by size heuristics.
    Returns bounding boxes (x, y, w, h) sorted left to right.
    """
    height, width = binary_img.shape

    # Rebuild the staff-line mask and subtract it from the image.
    kernel_width = max(10, width // 30)
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))
    line_mask = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, line_kernel)
    no_lines = cv2.bitwise_and(binary_img.copy(), cv2.bitwise_not(line_mask))

    # Close tiny gaps so each notehead becomes one solid blob.
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    no_lines = cv2.morphologyEx(no_lines, cv2.MORPH_CLOSE, ellipse, iterations=1)

    contours, _ = cv2.findContours(no_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    boxes = []
    for contour in contours:
        x, y, box_w, box_h = cv2.boundingRect(contour)
        if cv2.contourArea(contour) < 30:
            continue  # too small: noise
        if box_h > 3 * box_w and box_h > 40:
            continue  # very tall and thin: likely a stem or flag
        if box_w > binary_img.shape[1] * 0.6:
            continue  # spans most of the page width: text or decoration
        boxes.append((x, y, box_w, box_h))

    # Left-to-right reading order.
    return sorted(boxes, key=lambda b: b[0])


# -------------------------

# Map vertical position to pitch (treble clef)

# -------------------------

def map_y_to_pitch(y_center, staff_lines):
    """
    Map a vertical pixel coordinate to a treble-clef pitch name.

    Parameters
    ----------
    y_center : float
        Vertical center (in pixels) of a detected notehead.
    staff_lines : sequence of int
        y-positions of the detected staff lines (any order).  Assumed to
        be the five lines of a treble staff, so the top line is F5.

    Returns
    -------
    str
        Pitch name with octave, e.g. ``"F5"``, ``"B4"``, ``"C4"``.

    Notes
    -----
    Adjacent staff positions (line -> space -> line) are *diatonic*
    steps: each half-space moves one letter name, not one semitone.
    The previous implementation stepped chromatically (1 semitone per
    position), which mislabeled every position more than two steps from
    the top line (e.g. the D5 line came out as D#5, the middle line B4
    as C#5).  Accidentals and key signatures are still not detected, so
    all returned pitches are naturals.
    """
    lines = sorted(staff_lines)
    if len(lines) >= 2:
        # Median inter-line distance resists a single misdetected line.
        spacing = float(np.median(np.diff(lines)))
    else:
        # With fewer than two lines the spacing cannot be inferred; use a
        # nominal value so the caller's best-effort flow doesn't crash.
        spacing = 10.0
    half = spacing / 2.0  # one staff position = half a line spacing

    # Signed position index: 0 on the top line, +1 per half-space moving
    # downward, negative above the staff.  Ledger-line positions fall out
    # naturally from the rounding.
    position = int(round((y_center - lines[0]) / half))

    # Absolute diatonic index with C0 = 0 (C=0, D=1, ..., B=6 within an
    # octave).  The top treble line F5 is 5*7 + 3 = 38; each position
    # downward is one diatonic step down.
    diatonic = 38 - position
    octave, degree = divmod(diatonic, 7)
    return f"{'CDEFGAB'[degree]}{octave}"


# -------------------------

# Build music21 stream from detected notes

# -------------------------

def build_stream_from_boxes(boxes, staff_lines, tempo_bpm=100):
    """
    Turn notehead bounding boxes into a music21 Stream of quarter notes.

    Boxes are consumed left to right; each box's vertical center is mapped
    to a treble-clef pitch.  A metronome mark and a 4/4 time signature are
    prepended.  Durations are fixed at one quarter note — stem/flag
    detection (and chord grouping of same-x boxes) is not implemented.
    """
    result = stream.Stream()
    result.append(tempo.MetronomeMark(number=tempo_bpm))
    result.append(meter.TimeSignature('4/4'))

    for x, y, box_w, box_h in boxes:
        center_y = y + box_h / 2.0
        quarter = note.Note(map_y_to_pitch(center_y, staff_lines))
        quarter.duration.quarterLength = 1.0  # quarter note default
        result.append(quarter)

    return result


# -------------------------

# Main flow

# -------------------------

def process_image_to_midi(input_path, output_midi_path, debug=False):
    """
    Full pipeline: image file -> staff lines -> noteheads -> MIDI file.

    Prints progress diagnostics along the way.  Returns True on success,
    False when no noteheads were detected.  Raises FileNotFoundError if
    the input image cannot be read.
    """
    binary = binarize(load_image(input_path))

    staff_lines = detect_staff_lines(binary, debug=debug)
    if staff_lines and len(staff_lines) >= 5:
        print("Detected staff lines (y-coordinates):", staff_lines)
    else:
        print("Warning: could not detect 5 staff lines reliably. Trying to proceed with available lines.")

    boxes = detect_noteheads(binary, staff_lines, debug=debug)
    if not boxes:
        print("No noteheads detected. Exiting.")
        return False

    print(f"Detected {len(boxes)} candidate noteheads (left→right).")
    for i, (x, y, wbox, hbox) in enumerate(boxes, start=1):
        print(f"{i}: x={x}, y={y}, w={wbox}, h={hbox}")

    music_stream = build_stream_from_boxes(boxes, staff_lines, tempo_bpm=100)

    # music21's MIDI path: translate the stream, then write the file out.
    midi_file = midi.translate.streamToMidiFile(music_stream)
    midi_file.open(output_midi_path, 'wb')
    midi_file.write()
    midi_file.close()
    print(f"MIDI saved to {output_midi_path}")
    return True


# -------------------------

# CLI

# -------------------------

if __name__ == "__main__":
    # Require exactly an input image path and an output MIDI path.
    if len(sys.argv) < 3:
        print("Usage: python sheet_to_midi.py input_image.png output.mid")
        sys.exit(1)
    input_path, output_path = sys.argv[1], sys.argv[2]
    if not process_image_to_midi(input_path, output_path, debug=True):
        sys.exit(2)


# End of script.