Plagiarism Detector

 import difflib

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity


def read_file(filename):

    with open(filename, 'r', encoding='utf-8') as file:

        return file.read()


def basic_diff_score(text1, text2):

    seq = difflib.SequenceMatcher(None, text1, text2)

    return round(seq.ratio() * 100, 2)


def nlp_cosine_similarity(text1, text2):

    tfidf = TfidfVectorizer().fit_transform([text1, text2])

    score = cosine_similarity(tfidf[0:1], tfidf[1:2])

    return round(score[0][0] * 100, 2)


def main():

    file1 = input("Enter path to first file: ")

    file2 = input("Enter path to second file: ")


    text1 = read_file(file1)

    text2 = read_file(file2)


    basic_score = basic_diff_score(text1, text2)

    nlp_score = nlp_cosine_similarity(text1, text2)


    print("\n--- Plagiarism Detection Result ---")

    print(f"Simple Match (difflib): {basic_score}%")

    print(f"Semantic Match (TF-IDF Cosine Similarity): {nlp_score}%")


    if nlp_score > 80:

        print("⚠️ High similarity detected. Possible plagiarism.")

    elif nlp_score > 50:

        print("⚠️ Moderate similarity. Review recommended.")

    else:

        print("✅ Low similarity. Likely original.")


if __name__ == "__main__":

    main()


No comments: