# Document Similarity Checker

import spacy
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


@st.cache_resource
def _load_nlp():
    """Load the spaCy English pipeline once and reuse it across reruns.

    Streamlit re-executes this script from top to bottom on every widget
    interaction, so an uncached ``spacy.load`` would reload the model each
    time a file is uploaded or replaced.

    Returns:
        The loaded ``en_core_web_sm`` pipeline.
    """
    return spacy.load("en_core_web_sm")


# Shared pipeline used by preprocess(); the cached object is treated as
# read-only.
nlp = _load_nlp()


st.title("📄 Document Similarity Checker")


def preprocess(text):
    """Reduce *text* to a space-separated string of lemmas.

    The text is run through the module-level ``nlp`` pipeline. Stop words
    and tokens that are not purely alphabetic are dropped; each remaining
    token contributes its lemma.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip stop words first, then anything containing non-letters.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)


def _decode_upload(uploaded):
    """Return the text content of an uploaded file, or None if undecodable.

    ``getvalue()`` returns the whole buffer regardless of the current read
    position, unlike ``read()``. Decoding with ``utf-8-sig`` accepts plain
    UTF-8 and drops a leading byte-order mark, if present, so it does not
    end up attached to the first token.

    Args:
        uploaded: The object returned by ``st.file_uploader`` for one file.

    Returns:
        The decoded string, or ``None`` when the bytes are not valid UTF-8.
    """
    try:
        return uploaded.getvalue().decode("utf-8-sig")
    except UnicodeDecodeError:
        return None


def _tfidf_similarity(doc_a, doc_b):
    """Return the TF-IDF cosine similarity of two preprocessed documents.

    Args:
        doc_a: First document as a whitespace-separated string of terms.
        doc_b: Second document in the same form.

    Returns:
        The cosine similarity between the two TF-IDF vectors, or ``None``
        when the vectorizer cannot build a vocabulary from the inputs
        (for example, both documents are empty after preprocessing).
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([doc_a, doc_b])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary.
        return None
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]


file1 = st.file_uploader("Upload First Document", type=["txt"])
file2 = st.file_uploader("Upload Second Document", type=["txt"])

# Compare only once both uploads are present.
if file1 is not None and file2 is not None:
    text1 = _decode_upload(file1)
    text2 = _decode_upload(file2)
    sim_score = None

    if text1 is None or text2 is None:
        st.error(
            "Could not decode the uploaded files. "
            "Please upload UTF-8 encoded text files."
        )
    else:
        clean_text1 = preprocess(text1)
        clean_text2 = preprocess(text2)
        sim_score = _tfidf_similarity(clean_text1, clean_text2)
        if sim_score is None:
            st.warning(
                "No comparable words were found in the documents "
                "after preprocessing."
            )

    if sim_score is not None:
        st.subheader("🧮 Similarity Score")
        st.write(f"**{sim_score:.2f}** (1 = identical, 0 = completely different)")

        # Thresholds bucket the score into three qualitative bands.
        if sim_score > 0.75:
            st.success("The documents are quite similar! 🟢")
        elif sim_score > 0.4:
            st.info("The documents are moderately similar. 🟡")
        else:
            st.warning("The documents are quite different. 🔴")


No comments: