AI Resume Ranker
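The full script below ranks a folder of resumes against a job description. It reads .txt, .pdf, and .docx files, normalizes the text (optionally lemmatizing with spaCy), scores each resume as a weighted blend of TF-IDF cosine similarity and skill-keyword overlap, and exports the ranking to CSV.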

import os

import re

import argparse

import glob

from typing import List, Tuple, Dict, Optional


import pandas as pd

import numpy as np


from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity


# Document parsers (required for .docx/.pdf input): pip install docx2txt PyPDF2

import docx2txt

import PyPDF2


# Optional NLP

try:

    import spacy

    SPACY_OK = True

except Exception:

    SPACY_OK = False



# --------------------------- File Readers ---------------------------


def read_txt(path: str) -> str:

    with open(path, "r", encoding="utf-8", errors="ignore") as f:

        return f.read()


def read_docx(path: str) -> str:

    try:

        return docx2txt.process(path) or ""

    except Exception:

        return ""


def read_pdf(path: str) -> str:

    text = []

    try:

        with open(path, "rb") as f:

            reader = PyPDF2.PdfReader(f)

            for page in reader.pages:

                t = page.extract_text() or ""

                text.append(t)

    except Exception:

        pass

    return "\n".join(text)


def load_text_any(path: str) -> str:

    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":

        return read_txt(path)

    elif ext == ".docx":

        return read_docx(path)

    elif ext == ".pdf":

        return read_pdf(path)

    else:

        return ""



# --------------------------- Skills ---------------------------


DEFAULT_SKILLS = [

    # Generic

    "python","java","c++","javascript","typescript","sql","nosql","git","docker","kubernetes","linux",

    "aws","azure","gcp","bash","shell","rest","graphql","microservices",

    # Data/AI

    "pandas","numpy","scikit-learn","sklearn","tensorflow","pytorch","keras","nltk","spacy",

    "spark","hadoop","airflow","dbt","powerbi","tableau","matplotlib","seaborn",

    # Web/Backend

    "django","flask","fastapi","spring","node","express","react","angular","vue",

    # DevOps/Cloud

    "terraform","ansible","jenkins","ci/cd","prometheus","grafana","elk","rabbitmq","kafka",

    # Testing

    "pytest","unittest","selenium","cypress",

    # Security & Other

    "oauth","jwt","scrum","agile","jira"

]


def load_skills_file(path: Optional[str]) -> List[str]:

    if not path:

        return DEFAULT_SKILLS

    skills = []

    with open(path, "r", encoding="utf-8", errors="ignore") as f:

        for line in f:

            s = line.strip().lower()

            if s:

                skills.append(s)

    return sorted(set(skills))
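# The skills file is plain text, one skill per line. A hypothetical skills.txt:
#
#   python
#   fastapi
#   postgresql
#   ci/cd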



# --------------------------- NLP Cleaning ---------------------------


def build_spacy_pipeline(use_spacy: bool):

    if use_spacy and SPACY_OK:

        try:

            nlp = spacy.load("en_core_web_sm", disable=["ner","parser","textcat"])

            return nlp

        except Exception:

            return None

    return None
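# To enable the optional lemmatization path:
#   pip install spacy
#   python -m spacy download en_core_web_sm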


CLEAN_RE = re.compile(r"[^a-z0-9+#./\- ]+")


def normalize_text(text: str) -> str:

    text = text.lower()

    text = text.replace("\n", " ").replace("\t", " ")

    text = CLEAN_RE.sub(" ", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text
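# Illustrative example:
#   normalize_text("Senior Python/SQL Dev!\n5+ yrs")  ->  "senior python/sql dev 5+ yrs"
# The regex keeps + # . / - so tokens like "c++", "c#", and "ci/cd" survive cleaning.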


def lemmatize_spacy(nlp, text: str) -> str:

    if not nlp:

        return text

    doc = nlp(text)

    return " ".join(tok.lemma_ for tok in doc if not tok.is_space)



# --------------------------- Feature Engineering ---------------------------


def skill_overlap_score(text: str, jd_skills: List[str]) -> float:

    """

    Compute a skill overlap score (0..1) = Jaccard-like:

    |skills_in_resume ∩ skills_in_jd| / |skills_in_jd|

    """

    text_tokens = set(re.findall(r"[a-z0-9+#.\-]+", text.lower()))

    resume_skills = set()

    for skill in jd_skills:

        tokens = skill.split()

        if len(tokens) == 1:

            if skill in text_tokens:

                resume_skills.add(skill)

        else:

            if skill in text:

                resume_skills.add(skill)

    if not jd_skills:

        return 0.0

    return len(resume_skills) / float(len(set(jd_skills)))
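# Worked example (illustrative): with jd_skills = ["python", "sql", "docker"]
# and a resume mentioning python and sql but not docker, the score is
# 2 / 3 ≈ 0.667, i.e. the resume covers two of the three JD skills.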



# --------------------------- Ranking ---------------------------


def rank_resumes(

    jd_text: str,

    resume_texts: Dict[str, str],

    use_spacy: bool = False,

    weights: Tuple[float, float] = (0.7, 0.3),

    custom_skills: Optional[List[str]] = None

) -> pd.DataFrame:


    w_sem, w_skill = weights

    if abs((w_sem + w_skill) - 1.0) > 1e-6:

        raise ValueError("weights must sum to 1.0")


    # Prepare spaCy if requested

    nlp = build_spacy_pipeline(use_spacy)


    # Normalize & (optionally) lemmatize

    jd_clean = normalize_text(jd_text)

    if nlp:

        jd_clean = lemmatize_spacy(nlp, jd_clean)


    cleaned = {}

    for fname, txt in resume_texts.items():

        t = normalize_text(txt)

        if nlp:

            t = lemmatize_spacy(nlp, t)

        cleaned[fname] = t


    # TF-IDF across JD + Resumes

    vectorizer = TfidfVectorizer(stop_words="english", max_features=40000, ngram_range=(1,2))

    corpus = [jd_clean] + list(cleaned.values())

    tfidf = vectorizer.fit_transform(corpus)


    # Cosine similarity of resumes against JD (index 0)

    sims = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()


    # Skills from JD text: keep base skills that appear in the JD
    # (substring match, so e.g. "java" also fires on "javascript")

    base_skills = custom_skills if custom_skills is not None else DEFAULT_SKILLS

    jd_skill_candidates = [s for s in base_skills if s in jd_clean]

    # Fallback: if no skills found in JD, keep base set (less strict)

    jd_skillset = jd_skill_candidates if jd_skill_candidates else base_skills


    # Skill overlap score per resume

    files = list(cleaned.keys())

    skill_scores = []

    for f in files:

        s = skill_overlap_score(cleaned[f], jd_skillset)

        skill_scores.append(s)


    # Final score

    final = w_sem * sims + w_skill * np.array(skill_scores)
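    # e.g. sims = 0.42 and skill overlap = 0.67 with the default (0.7, 0.3)
    # weights gives 0.7 * 0.42 + 0.3 * 0.67 = 0.495 (illustrative numbers)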


    df = pd.DataFrame({

        "resume_file": files,

        "semantic_similarity": np.round(sims, 4),

        "skill_overlap": np.round(skill_scores, 4),

        "final_score": np.round(final, 4)

    }).sort_values("final_score", ascending=False).reset_index(drop=True)


    return df
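# Minimal programmatic usage (a sketch; texts and file names are made up):
#
#   jd = "senior python developer with aws and docker experience"
#   resumes = {"alice.txt": "python, aws, docker ...", "bob.txt": "java, spring ..."}
#   ranking = rank_resumes(jd, resumes)   # alice.txt should rank first
#   print(ranking.head())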



# --------------------------- CLI ---------------------------


def main():

    parser = argparse.ArgumentParser(description="AI Resume Ranker — rank resumes against a job description.")

    parser.add_argument("--jd", required=True, help="Job description file (.txt/.pdf/.docx)")

    parser.add_argument("--resumes", required=True, help="Folder containing resumes (.txt/.pdf/.docx)")

    parser.add_argument("--export", default="ranked_resumes.csv", help="Path to export CSV results")

    parser.add_argument("--skills", default=None, help="Optional skills file (one skill per line)")

    parser.add_argument("--use-spacy", action="store_true", help="Enable spaCy lemmatization (install en_core_web_sm)")

    parser.add_argument("--weights", nargs=2, type=float, default=[0.7, 0.3],

                        help="Weights for [semantic_similarity skill_overlap], must sum to 1.0 (default 0.7 0.3)")

    args = parser.parse_args()


    # Load JD

    jd_text = load_text_any(args.jd)

    if not jd_text.strip():

        raise SystemExit(f"Could not read job description: {args.jd}")


    # Load resumes

    patterns = ["*.pdf", "*.docx", "*.txt"]

    files = []

    for p in patterns:

        files.extend(glob.glob(os.path.join(args.resumes, p)))

    if not files:

        raise SystemExit(f"No resumes found in: {args.resumes}")


    resume_texts = {}

    for f in files:

        txt = load_text_any(f)

        if txt.strip():

            resume_texts[os.path.basename(f)] = txt


    # Load skills (optional)

    custom_skills = load_skills_file(args.skills) if args.skills else None


    # Rank

    df = rank_resumes(

        jd_text=jd_text,

        resume_texts=resume_texts,

        use_spacy=args.use_spacy,

        weights=(args.weights[0], args.weights[1]),

        custom_skills=custom_skills

    )


    # Save

    df.to_csv(args.export, index=False)

    print("\nTop matches:")

    print(df.head(10).to_string(index=False))

    print(f"\nSaved results to: {args.export}")



if __name__ == "__main__":

    main()
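
A typical run, assuming the script is saved as resume_ranker.py (the file name here is illustrative):

    python resume_ranker.py --jd job_description.txt --resumes ./resumes --export ranked_resumes.csv

Add --use-spacy for lemmatization, or --weights 0.7 0.3 to adjust the blend of semantic similarity and skill overlap. The top ten matches print to the console and the full ranking is written to the CSV given by --export.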

