# Music Sheet to Audio Converter

"""

sheet_to_midi.py


Simple prototype: Convert a scanned single-line, monophonic staff in TREBLE CLEF

to a MIDI file using OpenCV -> heuristic notehead detection -> music21.


Limitations:

 - Monophonic, printed notation, single staff detection.

 - Treats each notehead as a quarter note by default.

 - No clef/key/time signature detection (assumes treble clef, 4/4).

 - Not a replacement for full OMR systems like Audiveris.


Usage:

    python sheet_to_midi.py input_image.png output.mid

"""


import sys

import cv2

import numpy as np

import math

from music21 import stream, note, midi, tempo, meter

from PIL import Image


# -------------------------

# Utility & image helpers

# -------------------------

def load_image(path):
    """Read the image at *path* as grayscale; raise FileNotFoundError if unreadable."""
    grayscale = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if grayscale is None:
        raise FileNotFoundError(f"Cannot open image: {path}")
    return grayscale


def binarize(img):
    """Binarize *img* (foreground white) with an adaptive mean threshold, robust to lighting."""
    return cv2.adaptiveThreshold(
        img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 15, 10
    )


# -------------------------

# Staff line detection

# -------------------------

def detect_staff_lines(binary_img, debug=False):
    """
    Locate the five lines of a staff in a binarized image.

    Horizontal strokes are isolated with a wide morphological opening, row
    sums are projected and thresholded, nearby peak rows are merged into
    line centers, and the most evenly spaced window of five lines is
    returned.  Returns a list of y-positions; fewer than five entries (or
    an empty list) when detection is unreliable.
    """
    height, width = binary_img.shape

    # A wide, 1-px-tall opening kernel keeps only long horizontal runs.
    kernel_width = max(10, width // 30)
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))
    horizontal = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, line_kernel)

    # Row projection, normalized to [0, 1] (epsilon avoids divide-by-zero).
    projection = np.sum(horizontal, axis=1)
    projection = (projection - projection.min()) / (projection.max() - projection.min() + 1e-9)

    # Rows clearing the (tunable) threshold are staff-line candidates.
    peaks = np.where(projection > 0.15)[0]
    if len(peaks) == 0:
        return []

    # Merge runs of adjacent candidate rows (gap <= 2 px) into single
    # line centers, averaging each run.
    lines = []
    run = [peaks[0]]
    for row in peaks[1:]:
        if row - run[-1] <= 2:
            run.append(row)
        else:
            lines.append(int(np.mean(run)))
            run = [row]
    if run:
        lines.append(int(np.mean(run)))

    if len(lines) < 5:
        return lines  # fallback: caller proceeds with what was found

    # A staff is five equally spaced lines; among all windows of five
    # consecutive detected lines, pick the one whose spacings have the
    # lowest variance.
    best_group = None
    best_score = 1e9
    for start in range(len(lines) - 4):
        window = lines[start:start + 5]
        variance = np.var(np.diff(window))
        if variance < best_score:
            best_score = variance
            best_group = window

    return best_group if best_group is not None else lines[:5]


# -------------------------

# Note head detection

# -------------------------

def detect_noteheads(binary_img, staff_lines, debug=False):
    """
    Find connected components that plausibly are noteheads.

    The staff-line mask (re-derived with the same morphology used for
    detection) is erased first so lines don't split heads, small gaps are
    closed, then external contours are filtered by size heuristics.
    Returns bounding boxes (x, y, w, h) sorted left to right.
    """
    height, width = binary_img.shape

    # Rebuild the staff-line mask and subtract it from the image.
    kernel_width = max(10, width // 30)
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))
    line_mask = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, line_kernel)
    no_lines = cv2.bitwise_and(binary_img.copy(), cv2.bitwise_not(line_mask))

    # Close tiny gaps so each notehead becomes one solid blob.
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    no_lines = cv2.morphologyEx(no_lines, cv2.MORPH_CLOSE, ellipse, iterations=1)

    contours, _ = cv2.findContours(no_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    boxes = []
    for contour in contours:
        x, y, box_w, box_h = cv2.boundingRect(contour)
        if cv2.contourArea(contour) < 30:
            continue  # too small: noise
        if box_h > 3 * box_w and box_h > 40:
            continue  # very tall and thin: likely a stem or flag
        if box_w > binary_img.shape[1] * 0.6:
            continue  # spans most of the page width: text or decoration
        boxes.append((x, y, box_w, box_h))

    # Left-to-right reading order.
    return sorted(boxes, key=lambda b: b[0])


# -------------------------

# Map vertical position to pitch (treble clef)

# -------------------------

def map_y_to_pitch(y_center, staff_lines):
    """
    Map a vertical pixel coordinate to a treble-clef pitch name.

    Parameters
    ----------
    y_center : float
        Vertical center (in pixels) of a detected notehead.
    staff_lines : sequence of int
        y-positions of the detected staff lines (any order).  Assumed to
        be the five lines of a treble staff, so the top line is F5.

    Returns
    -------
    str
        Pitch name with octave, e.g. ``"F5"``, ``"B4"``, ``"C4"``.

    Notes
    -----
    Adjacent staff positions (line -> space -> line) are *diatonic*
    steps: each half-space moves one letter name, not one semitone.
    The previous implementation stepped chromatically (1 semitone per
    position), which mislabeled every position more than two steps from
    the top line (e.g. the D5 line came out as D#5, the middle line B4
    as C#5).  Accidentals and key signatures are still not detected, so
    all returned pitches are naturals.
    """
    lines = sorted(staff_lines)
    if len(lines) >= 2:
        # Median inter-line distance resists a single misdetected line.
        spacing = float(np.median(np.diff(lines)))
    else:
        # With fewer than two lines the spacing cannot be inferred; use a
        # nominal value so the caller's best-effort flow doesn't crash.
        spacing = 10.0
    half = spacing / 2.0  # one staff position = half a line spacing

    # Signed position index: 0 on the top line, +1 per half-space moving
    # downward, negative above the staff.  Ledger-line positions fall out
    # naturally from the rounding.
    position = int(round((y_center - lines[0]) / half))

    # Absolute diatonic index with C0 = 0 (C=0, D=1, ..., B=6 within an
    # octave).  The top treble line F5 is 5*7 + 3 = 38; each position
    # downward is one diatonic step down.
    diatonic = 38 - position
    octave, degree = divmod(diatonic, 7)
    return f"{'CDEFGAB'[degree]}{octave}"


# -------------------------

# Build music21 stream from detected notes

# -------------------------

def build_stream_from_boxes(boxes, staff_lines, tempo_bpm=100):
    """
    Turn notehead bounding boxes into a music21 Stream of quarter notes.

    Boxes are consumed left to right; each box's vertical center is mapped
    to a treble-clef pitch.  A metronome mark and a 4/4 time signature are
    prepended.  Durations are fixed at one quarter note — stem/flag
    detection (and chord grouping of same-x boxes) is not implemented.
    """
    result = stream.Stream()
    result.append(tempo.MetronomeMark(number=tempo_bpm))
    result.append(meter.TimeSignature('4/4'))

    for x, y, box_w, box_h in boxes:
        center_y = y + box_h / 2.0
        quarter = note.Note(map_y_to_pitch(center_y, staff_lines))
        quarter.duration.quarterLength = 1.0  # quarter note default
        result.append(quarter)

    return result


# -------------------------

# Main flow

# -------------------------

def process_image_to_midi(input_path, output_midi_path, debug=False):
    """
    Full pipeline: image file -> staff lines -> noteheads -> MIDI file.

    Prints progress diagnostics along the way.  Returns True on success,
    False when no noteheads were detected.  Raises FileNotFoundError if
    the input image cannot be read.
    """
    binary = binarize(load_image(input_path))

    staff_lines = detect_staff_lines(binary, debug=debug)
    if staff_lines and len(staff_lines) >= 5:
        print("Detected staff lines (y-coordinates):", staff_lines)
    else:
        print("Warning: could not detect 5 staff lines reliably. Trying to proceed with available lines.")

    boxes = detect_noteheads(binary, staff_lines, debug=debug)
    if not boxes:
        print("No noteheads detected. Exiting.")
        return False

    print(f"Detected {len(boxes)} candidate noteheads (left→right).")
    for i, (x, y, wbox, hbox) in enumerate(boxes, start=1):
        print(f"{i}: x={x}, y={y}, w={wbox}, h={hbox}")

    music_stream = build_stream_from_boxes(boxes, staff_lines, tempo_bpm=100)

    # music21's MIDI path: translate the stream, then write the file out.
    midi_file = midi.translate.streamToMidiFile(music_stream)
    midi_file.open(output_midi_path, 'wb')
    midi_file.write()
    midi_file.close()
    print(f"MIDI saved to {output_midi_path}")
    return True


# -------------------------

# CLI

# -------------------------

if __name__ == "__main__":
    # Require exactly an input image path and an output MIDI path.
    if len(sys.argv) < 3:
        print("Usage: python sheet_to_midi.py input_image.png output.mid")
        sys.exit(1)
    input_path, output_path = sys.argv[1], sys.argv[2]
    if not process_image_to_midi(input_path, output_path, debug=True):
        sys.exit(2)


# End of script.