import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
# Name of the spaCy pipeline package that gets loaded below.
_SPACY_MODEL = "en_core_web_sm"

# Module-level spaCy pipeline; preprocess() calls it on every document.
nlp = spacy.load(_SPACY_MODEL)

# Heading rendered at the top of the Streamlit page.
st.title("📄 Document Similarity Checker")
def preprocess(text: str) -> str:
    """Reduce raw text to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words, and tokens that are not purely alphabetic
    (``token.is_alpha``), are discarded; the lemma of every remaining token
    is kept in document order.

    Args:
        text: Raw document text.

    Returns:
        The kept lemmas joined by single spaces, or an empty string when no
        token passes the filter.
    """
    doc = nlp(text)
    return " ".join(
        token.lemma_ for token in doc if not token.is_stop and token.is_alpha
    )
# Two plain-text uploads; nothing below runs until both widgets hold a file.
file1 = st.file_uploader("Upload First Document", type=["txt"])
file2 = st.file_uploader("Upload Second Document", type=["txt"])

if file1 and file2:
    try:
        # getvalue() returns the whole buffer regardless of the current read
        # position. "utf-8-sig" decodes plain UTF-8 and also strips a leading
        # BOM, which would otherwise attach to the first word and cause it to
        # be filtered out as non-alphabetic during preprocessing.
        text1 = file1.getvalue().decode("utf-8-sig")
        text2 = file2.getvalue().decode("utf-8-sig")
    except UnicodeDecodeError:
        st.error(
            "Could not read the uploaded files as UTF-8 text. "
            "Please re-save them with UTF-8 encoding and upload again."
        )
        st.stop()

    clean_text1 = preprocess(text1)
    clean_text2 = preprocess(text2)

    tfidf = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf.fit_transform([clean_text1, clean_text2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither
        # document contains a usable token after preprocessing, e.g. empty
        # files or files made up only of stop words, digits or punctuation.
        st.warning(
            "Neither document contains any comparable words after "
            "preprocessing, so no similarity score can be computed."
        )
        st.stop()

    # Row 0 is the first document, row 1 the second; the result is a 1x1 matrix.
    sim_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    st.subheader("🧮 Similarity Score")
    st.write(f"**{sim_score:.2f}** (1 = identical, 0 = completely different)")

    # Bucket the score into a coarse verdict for the user.
    if sim_score > 0.75:
        st.success("The documents are quite similar! 🟢")
    elif sim_score > 0.4:
        st.info("The documents are moderately similar. 🟡")
    else:
        st.warning("The documents are quite different. 🔴")
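# NOTE: the two lines below are blog-page footer text captured with the
# script; they are not part of the program.
# No comments:
# Post a Comment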