import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def read_file(filename):
    """Return the full contents of *filename* decoded as UTF-8 text.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's contents as a single string.
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def basic_diff_score(text1, text2):
    """Return the character-level similarity of two texts as a percentage.

    The score is ``difflib.SequenceMatcher.ratio()`` scaled to 0-100 and
    rounded to two decimal places.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        A float between 0.0 and 100.0; 100.0 means the texts are identical.
    """
    # autojunk must be disabled: with the default (True), once the second
    # sequence has 200+ items difflib ignores every element that makes up
    # more than ~1% of it. For character-level comparison of real documents
    # that is nearly every character, so near-identical texts could score
    # close to 0%. The trade-off is slower matching on very large inputs.
    matcher = difflib.SequenceMatcher(None, text1, text2, autojunk=False)
    return round(matcher.ratio() * 100, 2)
def nlp_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts as a percentage.

    Both texts are vectorized together with a default ``TfidfVectorizer``,
    the cosine similarity of the two resulting rows is computed, and the
    value is scaled by 100 and rounded to two decimal places.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The similarity score scaled to a percentage.

    NOTE(review): scikit-learn is expected to raise ``ValueError`` when
    neither text yields any token (e.g. both are empty) -- confirm, and
    decide how callers should handle that case.
    """
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform([text1, text2])
    # Keep each document as a 1-row matrix, the 2-D shape cosine_similarity
    # is given in the original call.
    first_row, second_row = matrix[0:1], matrix[1:2]
    similarity = cosine_similarity(first_row, second_row)[0][0]
    return round(similarity * 100, 2)
def main():
    """Prompt for two file paths, score their similarity, and print a report.

    Reads both files, computes the difflib score and the TF-IDF cosine
    score, prints both, and then prints a verdict line. The verdict is
    based on the TF-IDF score only: above 80 is reported as high
    similarity, above 50 as moderate, anything else as low.
    """
    first_path = input("Enter path to first file: ")
    second_path = input("Enter path to second file: ")

    first_text = read_file(first_path)
    second_text = read_file(second_path)

    basic_score = basic_diff_score(first_text, second_text)
    nlp_score = nlp_cosine_similarity(first_text, second_text)

    print("\n--- Plagiarism Detection Result ---")
    print(f"Simple Match (difflib): {basic_score}%")
    print(f"Semantic Match (TF-IDF Cosine Similarity): {nlp_score}%")

    # Only the TF-IDF score drives the verdict; the difflib score is
    # informational.
    if nlp_score > 80:
        verdict = "⚠️ High similarity detected. Possible plagiarism."
    elif nlp_score > 50:
        verdict = "⚠️ Moderate similarity. Review recommended."
    else:
        verdict = "✅ Low similarity. Likely original."
    print(verdict)
# Run the interactive comparison only when this file is executed as a
# script, so importing the module has no side effects.
if __name__ == "__main__":
    main()