AI-Focused Book Summarizer & Chapter Highlighter

import re

import fitz  # PyMuPDF (install with: pip install pymupdf)
import nltk
from transformers import pipeline

# Sentence tokenizer data; newer NLTK releases also ship it as "punkt_tab"
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# Load the HuggingFace summarizer
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
)
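
The pipeline runs on CPU by default; if a CUDA GPU is available, passing a device index makes summarization much faster (a sketch, assuming your first GPU is device 0):

summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0,  # assumed first CUDA GPU; omit this argument to stay on CPU
)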


# ----------------------------------------
# 1. Extract text from PDF
# ----------------------------------------
def extract_pdf_text(pdf_path):
    # The Document object supports the context-manager protocol,
    # so the file is closed even if extraction fails partway.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
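
Before running the whole pipeline, it helps to sanity-check the raw extraction, because scanned PDFs without a text layer come back empty ("mybook.pdf" is a placeholder path):

text = extract_pdf_text("mybook.pdf")
print(len(text), "characters extracted")
print(text[:300])  # eyeball the first few lines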



# ----------------------------------------
# 2. Split text into chapters
#    Matches headings like:
#     - Chapter 1
#     - CHAPTER I
#     - CHAPTER ONE
# ----------------------------------------
def split_into_chapters(text):
    # One capturing group, so re.split keeps the matched headings in the
    # result: [front_matter, title_1, body_1, title_2, body_2, ...].
    # The final \w+ alternative catches "Chapter One"-style headings, but it
    # can also fire on "chapter" mid-sentence, so expect the occasional
    # false split on some books.
    chapter_pattern = r"(chapter\s+\d+|chapter\s+[ivxlcdm]+|chapter\s+\w+)"
    found = re.split(chapter_pattern, text, flags=re.IGNORECASE)

    chapters = []
    for i in range(1, len(found), 2):
        title = found[i].strip()
        content = found[i + 1].strip()
        chapters.append((title, content))

    # If no chapter headings were detected, treat the whole book as one chapter
    if not chapters:
        return [("Full Book", text)]

    return chapters
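
The pairing loop works because re.split keeps the captured headings in its output. On a tiny example:

sample = "Intro text. Chapter 1 First part. Chapter 2 Second part."
pieces = re.split(r"(chapter\s+\d+)", sample, flags=re.IGNORECASE)
# pieces == ['Intro text. ', 'Chapter 1', ' First part. ',
#            'Chapter 2', ' Second part.']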



# ----------------------------------------
# 3. Summarize long text in safe chunks
# ----------------------------------------
def summarize_long_text(text, max_chunk=1500):
    # Pack whole sentences into chunks of at most max_chunk characters,
    # so no chunk starts or ends mid-sentence.
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    # Summarize each chunk, then join the partial summaries
    summaries = []
    for chunk in chunks:
        summary = summarizer(
            chunk, max_length=130, min_length=30, do_sample=False
        )[0]["summary_text"]
        summaries.append(summary)

    return "\n".join(summaries)



# ----------------------------------------
# 4. Generate key points
# ----------------------------------------
def generate_key_points(summary):
    sentences = nltk.sent_tokenize(summary)
    key_points = sentences[:5]  # first five sentences of the summary
    return ["• " + s for s in key_points]



# ----------------------------------------
# 5. End-to-End Pipeline
# ----------------------------------------
def summarize_book(pdf_path):
    print("\nšŸ“˜ Extracting PDF text...")
    text = extract_pdf_text(pdf_path)

    print("✂ Splitting into chapters...")
    chapters = split_into_chapters(text)

    results = []

    for idx, (title, content) in enumerate(chapters, start=1):
        print(f"\nšŸ“ Summarizing {title} ({idx}/{len(chapters)})...")

        summary = summarize_long_text(content)
        key_points = generate_key_points(summary)

        results.append({
            "chapter_title": title,
            "summary": summary,
            "key_points": key_points,
        })

    return results
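
Because summarize_book returns plain lists and dicts, saving the report for later is easy with the standard library; a minimal sketch (the file names are placeholders):

import json

results = summarize_book("mybook.pdf")
with open("summary_report.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)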



# ----------------------------------------
# 6. Print Results Nicely
# ----------------------------------------
def display_results(results):
    print("\n============================")
    print("šŸ“š BOOK SUMMARY REPORT")
    print("============================\n")

    for item in results:
        print(f"\n===== {item['chapter_title']} =====\n")
        print("SUMMARY:\n")
        print(item["summary"])

        print("\nKEY POINTS:")
        for p in item["key_points"]:
            print(p)

        print("\n---------------------------")



# ----------------------------------------
# RUN
# ----------------------------------------
if __name__ == "__main__":
    pdf_path = input("Enter PDF file path: ").strip()

    results = summarize_book(pdf_path)
    display_results(results)

    print("\n✔ Done! All chapters processed.")

