import os
import re
import pandas as pd
import whisper
from datetime import datetime
# Optional: For GPT-4 summarization
import openai
from dotenv import load_dotenv
# Run python-dotenv's loader before reading the key, so a key kept in a local
# .env file (presumably next to the script -- confirm) is visible to os.getenv.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is unset; in that case the key
# stays unset here and the problem only surfaces when the API is called.
openai.api_key = os.getenv("OPENAI_API_KEY")
# ========== CONFIG ==========
# Directory scanned for input recordings.
AUDIO_FOLDER = "audio"
# Directory that receives one plain-text transcript per recording.
TRANSCRIPT_FOLDER = "transcriptions"
# Directory that receives the per-meeting CSV notes.
NOTES_FOLDER = "notes_output"

# ========== SETUP ==========
# Make sure both output directories exist before anything tries to write to
# them; exist_ok keeps repeated runs from failing.
for _output_dir in (TRANSCRIPT_FOLDER, NOTES_FOLDER):
    os.makedirs(_output_dir, exist_ok=True)
# ========== 1. Transcribe Audio ==========
# Whisper models already loaded in this process, keyed by model name, so a
# batch of recordings does not reload the model for every file.
_WHISPER_MODELS = {}


def _get_whisper_model(model_name):
    """Return the Whisper model for *model_name*, loading it at most once."""
    if model_name not in _WHISPER_MODELS:
        _WHISPER_MODELS[model_name] = whisper.load_model(model_name)
    return _WHISPER_MODELS[model_name]


def transcribe_audio(file_path, model_name="base"):
    """Transcribe an audio file with Whisper and save the text to disk.

    The transcript is written as UTF-8 to
    ``TRANSCRIPT_FOLDER/<audio file name without its extension>.txt``.

    Args:
        file_path: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model`` (default "base").

    Returns:
        The transcript text (the ``"text"`` entry of Whisper's result).
    """
    model = _get_whisper_model(model_name)
    result = model.transcribe(file_path)
    text = result["text"]

    # splitext removes only the final extension, so "standup.2024.01.mp3"
    # becomes "standup.2024.01" rather than "standup"; cutting at the first
    # dot made differently named recordings overwrite each other's output.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_path = os.path.join(TRANSCRIPT_FOLDER, f"{stem}.txt")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)
    return text
# ========== 2. Extract Action Items ==========
# Phrases whose presence (case-insensitive substring) marks an action item.
_ACTION_KEYWORDS = (
    "should", "need to", "must", "let's", "we will", "assign", "follow up", "due",
)
# A list marker ("-", "*", "1.") at the very start of a line, followed by
# whitespace. Anchoring keeps hyphens inside words ("follow-up") and decimals
# ("3.5") from being mistaken for bullets.
_BULLET_PREFIX = re.compile(r"^(?:[-*]|\d+\.)\s+")
# Whitespace that follows sentence-ending punctuation.
_SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+")


def extract_action_items(text):
    """Return the sentences of *text* that look like action items.

    Each line has a leading bullet or number marker removed and is then split
    into sentences; a sentence is kept when it contains one of the action
    keywords. Sentence splitting matters because a transcript is often one
    long line, which would otherwise be returned whole.

    Args:
        text: Transcript or notes text. ``None`` or an empty string is allowed.

    Returns:
        A list of unique action-item strings in order of first appearance.
    """
    if not text:
        return []

    # A dict is used as an ordered set: duplicates collapse, order is stable.
    found = {}
    for raw_line in text.splitlines():
        line = _BULLET_PREFIX.sub("", raw_line.strip(), count=1)
        for sentence in _SENTENCE_BREAK.split(line):
            sentence = sentence.strip()
            if not sentence:
                continue
            lowered = sentence.lower()
            if any(keyword in lowered for keyword in _ACTION_KEYWORDS):
                found.setdefault(sentence, None)
    return list(found)
# ========== 3. Summarize with GPT (Optional) ==========
def summarize_with_gpt(transcript_text):
    """Ask the OpenAI chat API for a summary of a meeting transcript.

    Works with both generations of the ``openai`` package: the 1.x client
    interface is used when it is available, otherwise the legacy
    ``openai.ChatCompletion`` call (which 1.x removed) is used.

    Args:
        transcript_text: Full transcript of the meeting.

    Returns:
        The summary text from the first completion choice, or an empty string
        when the transcript is empty/blank (no API request is made).
    """
    # Nothing to summarize: skip the network call entirely.
    if not transcript_text or not transcript_text.strip():
        return ""

    messages = [
        {"role": "system", "content": "You are an AI assistant that summarizes meeting transcripts."},
        {"role": "user", "content": f"Summarize this meeting:\n\n{transcript_text}"},
    ]

    # openai>=1.0 exposes the OpenAI client class; the old module-level
    # ChatCompletion entry point raises there, so the package generation is
    # detected by the client class rather than by ChatCompletion.
    if hasattr(openai, "OpenAI"):
        client = openai.OpenAI(api_key=openai.api_key)
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=messages,
        )
        return response.choices[0].message.content

    response = openai.ChatCompletion.create(
        model="gpt-4-turbo",
        messages=messages,
    )
    return response['choices'][0]['message']['content']
# ========== 4. Save Final Notes ==========
def save_notes(transcript, actions, summary=None, filename="meeting_notes"):
    """Write the meeting notes to a timestamped CSV file in NOTES_FOLDER.

    The CSV has the columns "Section" and "Content" and three rows:
    the transcript, the action items (one per line within a single cell),
    and the summary (or "Not generated" when no summary was given).

    Args:
        transcript: Full transcript text.
        actions: Iterable of action-item strings.
        summary: Optional summary text.
        filename: Base name of the CSV; a "_YYYYMMDD_HHMM" stamp is appended.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M")
    csv_path = os.path.join(NOTES_FOLDER, f"{filename}_{stamp}.csv")

    summary_text = summary if summary else "Not generated"
    rows = [
        ("Transcript", transcript),
        ("Action Items", "\n".join(actions)),
        ("Summary", summary_text),
    ]
    notes = pd.DataFrame(rows, columns=["Section", "Content"])
    notes.to_csv(csv_path, index=False)

    print(f"[✔] Notes saved to {csv_path}")
# ========== MAIN ==========
def process_meeting(file_path, use_gpt=False):
    """Run the full pipeline for one recording.

    Transcribes the audio, extracts action items, optionally asks GPT for a
    summary, and saves everything through ``save_notes``.

    Args:
        file_path: Path of the audio file to process.
        use_gpt: When true, also generate a summary with ``summarize_with_gpt``.
    """
    print(f"🔊 Transcribing: {file_path}")
    transcript = transcribe_audio(file_path)

    print("✅ Extracting action items...")
    actions = extract_action_items(transcript)

    summary = None
    if use_gpt:
        print("🤖 Summarizing with GPT...")
        summary = summarize_with_gpt(transcript)

    # splitext drops only the final extension; cutting at the first dot
    # turned "standup.2024.01.mp3" into "standup" and made notes collide.
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    save_notes(transcript, actions, summary, file_name)
# ========== RUN ==========
if __name__ == "__main__":
    # A missing input folder is treated like an empty one, so a first run
    # prints the warning below instead of crashing inside os.listdir.
    audio_files = []
    if os.path.isdir(AUDIO_FOLDER):
        # Match extensions case-insensitively (".MP3", ".Wav") and sort so
        # recordings are processed in a predictable order.
        audio_files = sorted(
            name for name in os.listdir(AUDIO_FOLDER)
            if name.lower().endswith(('.mp3', '.wav'))
        )

    if not audio_files:
        print("⚠️ No audio files found in /audio folder.")
    else:
        for audio_name in audio_files:
            process_meeting(os.path.join(AUDIO_FOLDER, audio_name), use_gpt=True)
# End of script. (Blog-page footer text that followed the code was not valid Python.)