pip install pymupdf nltk wordcloud
import nltk
nltk.download('punkt')
nltk.download('stopwords')
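Depending on your NLTK version, word_tokenize may also look for the newer punkt_tab resource; if tokenization later raises a LookupError, the extra download below (optional and version-dependent) should resolve it.

nltk.download('punkt_tab')  # needed by word_tokenize on recent NLTK releases; harmless if already present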
import fitz # PyMuPDF
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import tkinter as tk
from tkinter import filedialog
import matplotlib.pyplot as plt
import string
def extract_text_from_pdf(pdf_path):
    # Read every page of the PDF with PyMuPDF and return the combined text
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text
def clean_and_tokenize(text):
    # Lowercase, remove punctuation
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    # Remove stopwords & short words
    stop_words = set(stopwords.words("english"))
    keywords = [word for word in tokens if word not in stop_words and len(word) > 2]
    return keywords
def generate_wordcloud(keywords):
    # Count keyword frequencies and render them as a word cloud
    word_freq = nltk.FreqDist(keywords)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Top Keywords in Job Description", fontsize=16)
    plt.show()
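If you also want to keep the image (or you are running without an interactive Matplotlib backend), the wordcloud library can write the figure straight to disk; one option, with a placeholder filename, is to add a single line inside generate_wordcloud just before plt.show():

    wordcloud.to_file("job_keywords.png")  # placeholder filename; saves the cloud as a PNG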
def main():
    # Open file dialog
    root = tk.Tk()
    root.withdraw()
    file_path = filedialog.askopenfilename(title="Select Job Description PDF", filetypes=[("PDF Files", "*.pdf")])
    if not file_path:
        print("No file selected.")
        return
    print("Processing...")
    text = extract_text_from_pdf(file_path)
    keywords = clean_and_tokenize(text)
    generate_wordcloud(keywords)
    print("\nTop 20 Keywords:")
    for word, freq in nltk.FreqDist(keywords).most_common(20):
        print(f"{word} - {freq}")

if __name__ == "__main__":
    main()
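If you would rather skip the Tkinter file dialog (for example, in a notebook or on a machine without a display), the same helper functions can be called directly. This is just a sketch; the PDF path below is a placeholder:

# Direct usage without the file dialog (placeholder path)
text = extract_text_from_pdf("job_description.pdf")
keywords = clean_and_tokenize(text)
print(nltk.FreqDist(keywords).most_common(20))  # top keywords as (word, count) pairs
generate_wordcloud(keywords)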