# Voice Emotion Detector

import os

import librosa

import numpy as np

import sounddevice as sd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder, StandardScaler

import pickle


# -----------------------

# STEP 1: Feature Extraction

# -----------------------

def extract_features(file_path):
    """Build a fixed-length feature vector for one audio file.

    Up to 3 seconds of audio are loaded, skipping the first 0.5 seconds.
    Three librosa feature matrices are computed from that signal (40 MFCCs,
    chroma, mel spectrogram); each one is transposed and averaged over
    axis 0, and the three resulting vectors are concatenated.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.

    Returns:
        1-D NumPy array: the MFCC means, then the chroma means, then the
        mel-spectrogram means. The chroma and mel lengths are whatever
        librosa's defaults produce.
    """
    signal, sample_rate = librosa.load(file_path, duration=3, offset=0.5)

    feature_matrices = (
        librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40),
        librosa.feature.chroma_stft(y=signal, sr=sample_rate),
        librosa.feature.melspectrogram(y=signal, sr=sample_rate),
    )

    # Collapse each matrix to a single vector so every clip yields the same
    # number of features regardless of how many columns (frames) it produced.
    averaged = [np.mean(matrix.T, axis=0) for matrix in feature_matrices]
    return np.hstack(averaged)


# -----------------------

# STEP 2: Training (Demo Dataset Simulation)

# -----------------------

def train_model():
    """Train an SVM emotion classifier from ``dataset/`` and pickle it.

    Every ``*.wav`` file in the ``dataset`` folder is one training sample.
    Its label is the part of the file name before the first underscore
    (``angry_1.wav`` -> ``angry``) and its features come from
    ``extract_features``. The data is split 80/20, a linear SVM with
    probability estimates is fitted on the training part, the accuracy on
    the held-out part is printed, and ``(model, encoder, scaler)`` is
    pickled to ``emotion_model.pkl``.

    Normally a real dataset (RAVDESS, CREMA-D etc.) would be loaded; this is
    a demo that works on a handful of files.

    Raises:
        ValueError: If ``dataset`` contains no ``.wav`` files.
        FileNotFoundError: Propagated from ``os.listdir`` when the
            ``dataset`` folder does not exist.
    """
    dataset_path = "dataset"  # folder with wav files: angry_1.wav, happy_2.wav, etc.

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # seeded train/test split below reproducible from run to run.
    wav_files = sorted(
        name for name in os.listdir(dataset_path) if name.endswith(".wav")
    )
    if not wav_files:
        raise ValueError(f"No .wav files found in '{dataset_path}'")

    X = np.array(
        [extract_features(os.path.join(dataset_path, name)) for name in wav_files]
    )
    labels = np.array([name.split("_")[0] for name in wav_files])

    # Encode string labels as integers for the classifier.
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only: fitting it on all of X lets
    # test-set statistics leak into training and inflates the reported score.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = SVC(kernel="linear", probability=True)
    model.fit(X_train, y_train)

    acc = model.score(X_test, y_test)
    print(f"Model trained with accuracy: {acc*100:.2f}%")

    # Save model together with the encoder and scaler needed at predict time.
    with open("emotion_model.pkl", "wb") as f:
        pickle.dump((model, encoder, scaler), f)


# -----------------------

# STEP 3: Record & Predict

# -----------------------

def record_and_predict(duration=3, fs=22050):
    """Record a clip, classify its emotion and plot the class probabilities.

    Audio is recorded through ``sounddevice`` and written to ``temp.wav``.
    The ``(model, encoder, scaler)`` tuple is then loaded from
    ``emotion_model.pkl``, the clip's features are extracted with
    ``extract_features`` and scaled, the predicted emotion is printed, and a
    bar chart of the per-class probabilities is shown.

    Args:
        duration: Length of the recording in seconds.
        fs: Sampling rate used for recording and for the saved WAV file.

    Raises:
        FileNotFoundError: Propagated from ``open`` when
            ``emotion_model.pkl`` does not exist.
    """
    # librosa.output.write_wav was removed in librosa 0.8; SciPy's WAV writer
    # is used instead. Imported here so this function is self-contained.
    from scipy.io import wavfile

    print("Recording...")
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()
    print("Recording complete. Saving as temp.wav...")
    # Single-channel recording is flattened to a 1-D signal before writing.
    wavfile.write("temp.wav", fs, recording.flatten())

    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # model file written by train_model() or another trusted source.
    with open("emotion_model.pkl", "rb") as f:
        model, encoder, scaler = pickle.load(f)

    features = extract_features("temp.wav").reshape(1, -1)
    features = scaler.transform(features)
    pred = model.predict(features)[0]
    probas = model.predict_proba(features)[0]

    emotion = encoder.inverse_transform([pred])[0]
    print(f"Detected Emotion: {emotion}")

    # predict_proba columns follow model.classes_, which can be a subset of
    # encoder.classes_ if a label never reached the training split; map the
    # model's own classes back to names so bars and labels always line up.
    class_names = encoder.inverse_transform(model.classes_)

    # Plot probabilities
    plt.bar(class_names, probas)
    plt.title("Emotion Prediction Confidence")
    plt.show()


# -----------------------

# MAIN

# -----------------------

if __name__ == "__main__":
    # Train once: a model file left by an earlier run is reused as-is.
    model_is_missing = not os.path.exists("emotion_model.pkl")
    if model_is_missing:
        print("Training model...")
        train_model()

    # Always finish by recording a clip and classifying it.
    record_and_predict()


# No comments: