Blog Pages

Job Description Keyword Extractor

pip install pymupdf nltk wordcloud

# One-time setup: fetch the NLTK data used by the tokenizer and stopword filter.
import nltk
nltk.download('punkt')
# NLTK 3.9+ loads the 'punkt_tab' resource for word_tokenize instead of the
# pickled 'punkt' models; fetch both so the script works on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')


import fitz  # PyMuPDF
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import tkinter as tk
from tkinter import filedialog
import matplotlib.pyplot as plt
import string

def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF as one string.

    Args:
        pdf_path: Path to the PDF file, passed directly to ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for each page, concatenated in
        page order. Empty when no page yields any text.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def clean_and_tokenize(text):
    """Turn raw text into a list of candidate keyword tokens.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, and the result is split with NLTK's ``word_tokenize``. Tokens
    that are English stopwords or have two or fewer characters are dropped.

    Args:
        text: The raw text to process.

    Returns:
        The surviving tokens in their original order; duplicates are kept.
    """
    # Normalise case and strip ASCII punctuation before tokenizing.
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    normalized = text.lower().translate(punctuation_stripper)
    candidates = word_tokenize(normalized)

    # Discard stopwords and very short tokens.
    english_stopwords = set(stopwords.words("english"))
    keywords = []
    for token in candidates:
        if token in english_stopwords or len(token) <= 2:
            continue
        keywords.append(token)
    return keywords

def generate_wordcloud(keywords):
    """Display a word cloud sized by how often each keyword occurs.

    Args:
        keywords: Iterable of keyword strings; repeated entries increase a
            word's frequency.
    """
    # Count occurrences, then build an 800x400 cloud on a white background.
    frequencies = nltk.FreqDist(keywords)
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud_builder.generate_from_frequencies(frequencies)

    # Render the cloud as an image with the axes hidden.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")
    plt.title("Top Keywords in Job Description", fontsize=16)
    plt.show()

def main():
    """Ask for a job-description PDF, then show and print its top keywords.

    Opens a file-selection dialog, extracts and tokenizes the chosen PDF's
    text, displays a word cloud, and prints the 20 most frequent keywords.
    Returns early with a message when no file is selected or when the PDF
    yields no keywords.
    """
    # A hidden root window hosts the file dialog without showing an empty frame.
    root = tk.Tk()
    root.withdraw()
    try:
        file_path = filedialog.askopenfilename(title="Select Job Description PDF", filetypes=[("PDF Files", "*.pdf")])
    finally:
        # Release the hidden root so it does not linger after the dialog closes.
        root.destroy()

    if not file_path:
        print("No file selected.")
        return

    print("Processing...")
    text = extract_text_from_pdf(file_path)
    keywords = clean_and_tokenize(text)

    # WordCloud raises ValueError when given zero words (e.g. a scanned,
    # image-only PDF with no extractable text), so stop with a clear message.
    if not keywords:
        print("No keywords found. The PDF may not contain extractable text.")
        return

    generate_wordcloud(keywords)

    print("\nTop 20 Keywords:")
    for word, freq in nltk.FreqDist(keywords).most_common(20):
        print(f"{word} - {freq}")

# Run the keyword extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

No comments:

Post a Comment