pip install pymupdf nltk wordcloud
import nltk
nltk.download('punkt')
nltk.download('stopwords')
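Depending on your NLTK version, word_tokenize may also look for the newer punkt_tab resource; if tokenization later raises a LookupError, the extra download below (optional and version-dependent) should resolve it.

nltk.download('punkt_tab')  # needed by word_tokenize on recent NLTK releases; harmless if already present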
import fitz # PyMuPDF
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import tkinter as tk
from tkinter import filedialog
import matplotlib.pyplot as plt
import string
def extract_text_from_pdf(pdf_path):
    # Read every page of the PDF with PyMuPDF and return the combined text
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text
def clean_and_tokenize(text):
    # Lowercase, remove punctuation
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    # Remove stopwords & short words
    stop_words = set(stopwords.words("english"))
    keywords = [word for word in tokens if word not in stop_words and len(word) > 2]
    return keywords
def generate_wordcloud(keywords):
    # Count keyword frequencies and render them as a word cloud
    word_freq = nltk.FreqDist(keywords)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Top Keywords in Job Description", fontsize=16)
    plt.show()
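If you also want to keep the image (or you are running without an interactive Matplotlib backend), the wordcloud library can write the figure straight to disk; one option, with a placeholder filename, is to add a single line inside generate_wordcloud just before plt.show():

    wordcloud.to_file("job_keywords.png")  # placeholder filename; saves the cloud as a PNG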
def main():
    # Open file dialog
    root = tk.Tk()
    root.withdraw()
    file_path = filedialog.askopenfilename(title="Select Job Description PDF", filetypes=[("PDF Files", "*.pdf")])
    if not file_path:
        print("No file selected.")
        return
    print("Processing...")
    text = extract_text_from_pdf(file_path)
    keywords = clean_and_tokenize(text)
    generate_wordcloud(keywords)
    print("\nTop 20 Keywords:")
    for word, freq in nltk.FreqDist(keywords).most_common(20):
        print(f"{word} - {freq}")

if __name__ == "__main__":
    main()
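If you would rather skip the Tkinter file dialog (for example, in a notebook or on a machine without a display), the same helper functions can be called directly. This is just a sketch; the PDF path below is a placeholder:

# Direct usage without the file dialog (placeholder path)
text = extract_text_from_pdf("job_description.pdf")
keywords = clean_and_tokenize(text)
print(nltk.FreqDist(keywords).most_common(20))  # top keywords as (word, count) pairs
generate_wordcloud(keywords)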