import fitz # PyMuPDF
import nltk
import re
from transformers import pipeline
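# Dependencies (a minimal sketch, assuming a standard PyTorch backend for
# the transformers pipeline — the original post does not list them):
#   pip install pymupdf nltk transformers torch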
# Load HuggingFace summarizer
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
)
# ----------------------------------------
# 1. Extract text from PDF
# ----------------------------------------
def extract_pdf_text(pdf_path):
    """Read every page of the PDF and return its text as one string."""
    full_text = ""
    with fitz.open(pdf_path) as doc:  # context manager closes the file for us
        for page in doc:
            full_text += page.get_text()
    return full_text
# ----------------------------------------
# 2. Split text into chapters
# Uses patterns like:
# - Chapter 1
# - CHAPTER I
# - CHAPTER ONE
# ----------------------------------------
def split_into_chapters(text):
    chapter_pattern = r"(chapter\s+\d+|chapter\s+[ivxlcdm]+|chapter\s+\w+)"
    # Because the pattern is wrapped in a capture group, re.split keeps the
    # matched headings: [preamble, title1, body1, title2, body2, ...]
    found = re.split(chapter_pattern, text, flags=re.IGNORECASE)
    chapters = []
    for i in range(1, len(found), 2):
        title = found[i].strip()
        content = found[i + 1].strip()
        chapters.append((title, content))
    # If no chapters are detected → return full book as one chapter
    if not chapters:
        return [("Full Book", text)]
    return chapters
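# A quick sanity check of the splitter (hypothetical sample text, not from
# the original post):
#
#   sample = "Preface...\nChapter 1\nIt begins.\nChapter 2\nIt ends."
#   split_into_chapters(sample)
#   → [("Chapter 1", "It begins."), ("Chapter 2", "It ends.")]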
# ----------------------------------------
# 3. Summarize long text in safe chunks
# ----------------------------------------
def summarize_long_text(text, max_chunk=1500):
    # sent_tokenize needs the punkt models; newer NLTK releases look up a
    # separate "punkt_tab" resource, so fetch both quietly.
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)
    sentences = nltk.sent_tokenize(text)
    # Greedily pack whole sentences into chunks of at most max_chunk
    # characters so each piece stays within the model's input limit.
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    # Summarize each chunk
    summaries = []
    for chunk in chunks:
        summary = summarizer(
            chunk, max_length=130, min_length=30, do_sample=False
        )[0]["summary_text"]
        summaries.append(summary)
    # Join all chunk summaries into one chapter summary
    return "\n".join(summaries)
# ----------------------------------------
# 4. Generate key points
# ----------------------------------------
def generate_key_points(summary):
    sentences = nltk.sent_tokenize(summary)
    key_points = sentences[:5]  # keep the first five sentences as key ideas
    return ["• " + s for s in key_points]
# ----------------------------------------
# 5. End-to-End Pipeline
# ----------------------------------------
def summarize_book(pdf_path):
    print("\n📄 Extracting PDF text...")
    text = extract_pdf_text(pdf_path)
    print("✂ Splitting into chapters...")
    chapters = split_into_chapters(text)
    results = []
    for idx, (title, content) in enumerate(chapters, start=1):
        print(f"\n📝 [{idx}/{len(chapters)}] Summarizing {title}...")
        summary = summarize_long_text(content)
        key_points = generate_key_points(summary)
        results.append({
            "chapter_title": title,
            "summary": summary,
            "key_points": key_points,
        })
    return results
# ----------------------------------------
# 6. Print Results Nicely
# ----------------------------------------
def display_results(results):
    print("\n============================")
    print("📚 BOOK SUMMARY REPORT")
    print("============================\n")
    for item in results:
        print(f"\n===== {item['chapter_title']} =====\n")
        print("SUMMARY:\n")
        print(item["summary"])
        print("\nKEY POINTS:")
        for p in item["key_points"]:
            print(p)
        print("\n---------------------------")
# ----------------------------------------
# RUN
# ----------------------------------------
if __name__ == "__main__":
pdf_path = input("Enter PDF file path: ").strip()
results = summarize_book(pdf_path)
display_results(results)
print("\n✔ Done! All chapters processed.")