# Python for Engineers: Document Similarity Checker

import streamlit as st

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

import spacy

@st.cache_resource
def _load_nlp():
    """Load the spaCy English pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every user interaction; caching the
    loaded pipeline as a resource avoids reloading the model each time.
    """
    return spacy.load("en_core_web_sm")


# Module-level pipeline used by the text preprocessing step.
nlp = _load_nlp()

st.title("📄 Document Similarity Checker")

def preprocess(text):
    """Reduce *text* to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or as non-alphabetic are discarded; the lemma of
    every remaining token is kept, in order.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

def _decode_upload(uploaded):
    """Return the text of an uploaded file, or ``None`` if it is not valid UTF-8.

    ``getvalue()`` is used instead of ``read()`` so the full content is
    returned regardless of the current stream position. ``utf-8-sig`` decodes
    plain UTF-8 identically but also drops a leading byte-order mark, which
    would otherwise stay attached to the first word.
    """
    try:
        return uploaded.getvalue().decode("utf-8-sig")
    except UnicodeDecodeError:
        return None


def _tfidf_similarity(doc_a, doc_b):
    """Return the TF-IDF cosine similarity of two texts, or ``None``.

    ``None`` is returned when the vectorizer raises ``ValueError``, which
    scikit-learn does when no usable terms remain in either text (for example
    when both documents are empty or contain only stop words).
    """
    tfidf = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf.fit_transform([doc_a, doc_b])
    except ValueError:
        return None
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]


file1 = st.file_uploader("Upload First Document", type=["txt"])
file2 = st.file_uploader("Upload Second Document", type=["txt"])

# Nothing is computed until both documents have been uploaded.
if file1 and file2:
    text1 = _decode_upload(file1)
    text2 = _decode_upload(file2)

    if text1 is None or text2 is None:
        # Report the problem in the page instead of crashing with a traceback.
        st.error("Could not read the files: both documents must be UTF-8 encoded text.")
    else:
        clean_text1 = preprocess(text1)
        clean_text2 = preprocess(text2)
        sim_score = _tfidf_similarity(clean_text1, clean_text2)

        if sim_score is None:
            st.warning(
                "No comparable words were found in the documents, "
                "so a similarity score cannot be computed."
            )
        else:
            st.subheader("🧮 Similarity Score")
            st.write(f"**{sim_score:.2f}** (1 = identical, 0 = completely different)")

            # Bucket the score into a coarse, colour-coded verdict.
            if sim_score > 0.75:
                st.success("The documents are quite similar! 🟢")
            elif sim_score > 0.4:
                st.info("The documents are moderately similar. 🟡")
            else:
                st.warning("The documents are quite different. 🔴")

# Python for Engineers