import os
import re
import argparse
import glob
from typing import List, Tuple, Dict, Optional
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Optional parsers; the readers below fall back to "" when one is missing
try:
    import docx2txt
except Exception:
    docx2txt = None
try:
    import PyPDF2
except Exception:
    PyPDF2 = None
# Optional NLP
try:
    import spacy
    SPACY_OK = True
except Exception:
    SPACY_OK = False
# --------------------------- File Readers ---------------------------
def read_txt(path: str) -> str:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()
def read_docx(path: str) -> str:
    if docx2txt is None:  # parser not installed
        return ""
    try:
        return docx2txt.process(path) or ""
    except Exception:
        return ""
def read_pdf(path: str) -> str:
    if PyPDF2 is None:  # parser not installed
        return ""
    text = []
    try:
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                t = page.extract_text() or ""
                text.append(t)
    except Exception:
        pass
    return "\n".join(text)
def load_text_any(path: str) -> str:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        return read_txt(path)
    elif ext == ".docx":
        return read_docx(path)
    elif ext == ".pdf":
        return read_pdf(path)
    else:
        return ""
# --------------------------- Skills ---------------------------
DEFAULT_SKILLS = [
    # Generic
    "python","java","c++","javascript","typescript","sql","nosql","git","docker","kubernetes","linux",
    "aws","azure","gcp","bash","shell","rest","graphql","microservices",
    # Data/AI
    "pandas","numpy","scikit-learn","sklearn","tensorflow","pytorch","keras","nltk","spacy",
    "spark","hadoop","airflow","dbt","powerbi","tableau","matplotlib","seaborn",
    # Web/Backend
    "django","flask","fastapi","spring","node","express","react","angular","vue",
    # DevOps/Cloud
    "terraform","ansible","jenkins","ci/cd","prometheus","grafana","elk","rabbitmq","kafka",
    # Testing
    "pytest","unittest","selenium","cypress",
    # Security & Other
    "oauth","jwt","scrum","agile","jira"
]
def load_skills_file(path: Optional[str]) -> List[str]:
    if not path:
        return DEFAULT_SKILLS
    skills = []
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            s = line.strip().lower()
            if s:
                skills.append(s)
    return sorted(set(skills))
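# A skills file is plain text, one skill per line (case does not matter), e.g.:
#   python
#   aws
#   ci/cd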
# --------------------------- NLP Cleaning ---------------------------
def build_spacy_pipeline(use_spacy: bool):
    if use_spacy and SPACY_OK:
        try:
            nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "textcat"])
            return nlp
        except Exception:
            return None
    return None
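# Note: en_core_web_sm must be installed first (python -m spacy download en_core_web_sm);
# if it is missing, spacy.load raises and the pipeline silently falls back to None.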
CLEAN_RE = re.compile(r"[^a-z0-9+#./\- ]+")
def normalize_text(text: str) -> str:
    text = text.lower()
    text = text.replace("\n", " ").replace("\t", " ")
    text = CLEAN_RE.sub(" ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
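# Worked example: normalize_text("C++ & SQL!\n") -> "c++ sql"
# ("&" and "!" fall outside CLEAN_RE's allowed set; "+" is kept, so "c++" survives)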
def lemmatize_spacy(nlp, text: str) -> str:
    if not nlp:
        return text
    doc = nlp(text)
    return " ".join(tok.lemma_ for tok in doc if not tok.is_space)
# --------------------------- Feature Engineering ---------------------------
def skill_overlap_score(text: str, jd_skills: List[str]) -> float:
    """
    Fraction of JD skills found in the resume (a containment score in 0..1,
    not true Jaccard, since the denominator is the JD skill set only):
    |skills_in_resume ∩ skills_in_jd| / |skills_in_jd|
    """
    if not jd_skills:
        return 0.0
    text_tokens = set(re.findall(r"[a-z0-9+#.\-]+", text.lower()))
    resume_skills = set()
    for skill in jd_skills:
        tokens = skill.split()
        if len(tokens) == 1:
            # Single-word skill: exact token match, so "java" does not fire on "javascript"
            if skill in text_tokens:
                resume_skills.add(skill)
        else:
            # Multi-word skill: fall back to a substring match
            if skill in text:
                resume_skills.add(skill)
    return len(resume_skills) / float(len(set(jd_skills)))
# --------------------------- Ranking ---------------------------
def rank_resumes(
    jd_text: str,
    resume_texts: Dict[str, str],
    use_spacy: bool = False,
    weights: Tuple[float, float] = (0.7, 0.3),
    custom_skills: Optional[List[str]] = None
) -> pd.DataFrame:
    w_sem, w_skill = weights
    assert abs((w_sem + w_skill) - 1.0) < 1e-6, "weights must sum to 1"
    # Prepare spaCy if requested
    nlp = build_spacy_pipeline(use_spacy)
    # Normalize & (optionally) lemmatize
    jd_clean = normalize_text(jd_text)
    if nlp:
        jd_clean = lemmatize_spacy(nlp, jd_clean)
    cleaned = {}
    for fname, txt in resume_texts.items():
        t = normalize_text(txt)
        if nlp:
            t = lemmatize_spacy(nlp, t)
        cleaned[fname] = t
    # TF-IDF across JD + resumes (fit on the combined corpus so vocabularies align)
    files = list(cleaned.keys())
    vectorizer = TfidfVectorizer(stop_words="english", max_features=40000, ngram_range=(1, 2))
    corpus = [jd_clean] + [cleaned[f] for f in files]
    tfidf = vectorizer.fit_transform(corpus)
    # Cosine similarity of each resume against the JD (row 0)
    sims = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()
    # Skills from JD text: keep only base skills that appear in the JD.
    # Note: this is a loose substring check, so e.g. "java" also fires on "javascript".
    base_skills = custom_skills if custom_skills is not None else DEFAULT_SKILLS
    jd_skill_candidates = [s for s in base_skills if s in jd_clean]
    # Fallback: if no skills were found in the JD, keep the base set (less strict)
    jd_skillset = jd_skill_candidates if jd_skill_candidates else base_skills
    # Skill overlap score per resume
    skill_scores = [skill_overlap_score(cleaned[f], jd_skillset) for f in files]
    # Final score: weighted blend of semantic similarity and skill overlap
    final = w_sem * sims + w_skill * np.array(skill_scores)
    df = pd.DataFrame({
        "resume_file": files,
        "semantic_similarity": np.round(sims, 4),
        "skill_overlap": np.round(skill_scores, 4),
        "final_score": np.round(final, 4)
    }).sort_values("final_score", ascending=False).reset_index(drop=True)
    return df
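# Programmatic use, a minimal sketch (made-up texts, not part of the CLI below):
#   jd = "senior python developer with sql and docker experience"
#   resumes = {"a.txt": "python sql docker kubernetes", "b.txt": "java spring hibernate"}
#   print(rank_resumes(jd, resumes, weights=(0.5, 0.5)))
# Here "a.txt" should outrank "b.txt" on both similarity and skill overlap.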
# --------------------------- CLI ---------------------------
def main():
    parser = argparse.ArgumentParser(description="AI Resume Ranker: rank resumes against a job description.")
    parser.add_argument("--jd", required=True, help="Job description file (.txt/.pdf/.docx)")
    parser.add_argument("--resumes", required=True, help="Folder containing resumes (.txt/.pdf/.docx)")
    parser.add_argument("--export", default="ranked_resumes.csv", help="Path to export CSV results")
    parser.add_argument("--skills", default=None, help="Optional skills file (one skill per line)")
    parser.add_argument("--use-spacy", action="store_true", help="Enable spaCy lemmatization (install en_core_web_sm)")
    parser.add_argument("--weights", nargs=2, type=float, default=[0.7, 0.3],
                        help="Weights for semantic_similarity and skill_overlap; must sum to 1.0 (default: 0.7 0.3)")
    args = parser.parse_args()
    # Load JD
    jd_text = load_text_any(args.jd)
    if not jd_text.strip():
        raise SystemExit(f"Could not read job description: {args.jd}")
    # Load resumes
    patterns = ["*.pdf", "*.docx", "*.txt"]
    files = []
    for p in patterns:
        files.extend(glob.glob(os.path.join(args.resumes, p)))
    if not files:
        raise SystemExit(f"No resumes found in: {args.resumes}")
    resume_texts = {}
    for f in files:
        txt = load_text_any(f)
        if txt.strip():
            resume_texts[os.path.basename(f)] = txt
    # Load skills (optional)
    custom_skills = load_skills_file(args.skills) if args.skills else None
    # Rank
    df = rank_resumes(
        jd_text=jd_text,
        resume_texts=resume_texts,
        use_spacy=args.use_spacy,
        weights=(args.weights[0], args.weights[1]),
        custom_skills=custom_skills
    )
    # Save
    df.to_csv(args.export, index=False)
    print("\nTop matches:")
    print(df.head(10).to_string(index=False))
    print(f"\nSaved results to: {args.export}")
if __name__ == "__main__":
    main()
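# Example invocation (hypothetical paths; the script name resume_ranker.py is assumed):
#   python resume_ranker.py --jd jd.txt --resumes ./resumes --skills skills.txt \
#       --use-spacy --weights 0.6 0.4 --export ranked.csv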