"""High-accuracy document plagiarism checker (Streamlit app).

Setup (assumed environment):
    pip install streamlit python-docx nltk spacy matplotlib
    python -m spacy download en_core_web_md

Run with:
    streamlit run plagiarism_checker.py
"""

import base64
import re
from collections import Counter
from math import sqrt

import matplotlib.pyplot as plt
import nltk
import spacy
import streamlit as st
from docx import Document
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize

# Load the medium spaCy model; it ships with word vectors, which
# Doc.similarity() needs to produce meaningful scores.
nlp = spacy.load("en_core_web_md")

# Download the NLTK data used below (no-ops if already present).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)


def read_file_content(uploaded_file):
    """Return the plain text of an uploaded .txt or .docx file."""
    if uploaded_file.type == "text/plain":
        return uploaded_file.getvalue().decode("utf-8")
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(uploaded_file)
        return " ".join(paragraph.text for paragraph in doc.paragraphs)
    else:
        raise ValueError("Unsupported file type")


def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize, and drop English stopwords."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]


def cosine_similarity(vec1, vec2):
    """Cosine similarity between two term-frequency Counters."""
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    sum1 = sum(v ** 2 for v in vec1.values())
    sum2 = sum(v ** 2 for v in vec2.values())
    denominator = sqrt(sum1) * sqrt(sum2)
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def calculate_word_similarity(text1, text2):
    """Word-level cosine similarity between two texts, as a percentage."""
    vec1 = Counter(preprocess_text(text1))
    vec2 = Counter(preprocess_text(text2))
    return cosine_similarity(vec1, vec2) * 100


def calculate_sentence_similarity(text1, text2):
    """For each sentence in text1, find its best match in text2 and
    return the average of those best-match scores, as a percentage."""
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    similarities = []
    for sent1 in sentences1:
        max_similarity = 0.0
        for sent2 in sentences2:
            similarity = calculate_word_similarity(sent1, sent2)
            if similarity > max_similarity:
                max_similarity = similarity
        similarities.append(max_similarity)
    return sum(similarities) / len(similarities) if similarities else 0.0


def semantic_similarity(text1, text2):
    """Vector-based semantic similarity via spaCy, as a percentage."""
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2) * 100


def longest_common_subsequence(text1, text2):
    """Return the longest common subsequence of sentences between the texts."""
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    m, n = len(sentences1), len(sentences2)
    # Standard LCS dynamic program: L[i][j] is the LCS length of the
    # first i sentences of text1 and the first j sentences of text2.
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if sentences1[i - 1] == sentences2[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    # Backtrack through the table to recover the matched sentences.
    lcs = []
    i, j = m, n
    while i > 0 and j > 0:
        if sentences1[i - 1] == sentences2[j - 1]:
            lcs.append(sentences2[j - 1])
            i -= 1
            j -= 1
        elif L[i - 1][j] > L[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(lcs))


def suggest_rewrites(sentence):
    """Suggest a paraphrase by swapping content words (nouns, verbs,
    adjectives, adverbs) for their first WordNet synonym."""
    words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    rewrites = []
    for word, tag in tagged_words:
        syns = wordnet.synsets(word)
        if syns and tag.startswith(('N', 'V', 'J', 'R')):
            # WordNet joins multi-word lemmas with underscores.
            synonym = syns[0].lemmas()[0].name().replace('_', ' ')
            rewrites.append(synonym if synonym != word else word)
        else:
            rewrites.append(word)
    return " ".join(rewrites)
def create_bar_chart(word_similarity, sentence_similarity, semantic_similarity_score):
    """Render the three similarity scores as a bar chart."""
    fig, ax = plt.subplots()
    ax.bar(
        ["Word-Level Similarity", "Sentence-Level Similarity", "Semantic Similarity"],
        [word_similarity, sentence_similarity, semantic_similarity_score],
        color=["blue", "green", "orange"],
    )
    ax.set_ylabel("Percentage")
    ax.set_ylim(0, 100)
    ax.set_title("Document Similarity")
    st.pyplot(fig)


def download_report(word_similarity, sentence_similarity, semantic_similarity_score,
                    matched_sequences, reworded_matches):
    """Offer the results as a downloadable plain-text report."""
    report = f"Word-Level Similarity: {word_similarity:.2f}%\n"
    report += f"Sentence-Level Similarity: {sentence_similarity:.2f}%\n"
    report += f"Semantic Similarity: {semantic_similarity_score:.2f}%\n\n"
    report += "Matched Sequences from the Created Document:\n"
    for i, match in enumerate(matched_sequences, 1):
        report += f"{i}. {match}\n"
    report += "\nRewritten Suggestions to Avoid Plagiarism:\n"
    for i, reworded in enumerate(reworded_matches, 1):
        report += f"{i}. {reworded}\n"
    report_bytes = report.encode("utf-8")
    b64 = base64.b64encode(report_bytes).decode()
    # Embed the report as a base64 data URI so it can be downloaded
    # directly from the page.
    href = f'<a href="data:file/txt;base64,{b64}" download="plagiarism_report.txt">Download Report</a>'
    st.markdown(href, unsafe_allow_html=True)


def main():
    st.title("High-Accuracy Document Plagiarism Checker")
    doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
    doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])
    if doc1 is not None and doc2 is not None:
        try:
            text1 = read_file_content(doc1)  # Original document
            text2 = read_file_content(doc2)  # Created document

            word_similarity = calculate_word_similarity(text1, text2)
            sentence_similarity = calculate_sentence_similarity(text1, text2)
            semantic_similarity_score = semantic_similarity(text1, text2)
            matched_sequences = longest_common_subsequence(text1, text2)

            st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
            st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
            st.write(f"Semantic Similarity: {semantic_similarity_score:.2f}%")

            create_bar_chart(word_similarity, sentence_similarity, semantic_similarity_score)

            if word_similarity < 20:
                st.write("The created document is mostly original.")
            elif word_similarity < 50:
                st.write("There are some similarities between the created and original documents.")
            else:
                st.write("The created document has significant similarities with the original and may contain plagiarism.")

            if matched_sequences:
                st.subheader("Matched Content from the Created Document:")
                for i, match in enumerate(matched_sequences, 1):
                    st.write(f"{i}. {match}")

                # Rewrite the matched content to suggest paraphrases.
                reworded_matches = [suggest_rewrites(match) for match in matched_sequences]
                st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
                for i, reworded in enumerate(reworded_matches, 1):
                    st.write(f"{i}. {reworded}")

                download_report(word_similarity, sentence_similarity, semantic_similarity_score,
                                matched_sequences, reworded_matches)
            else:
                st.write("No significant matched content found from the created document.")
        except ValueError as e:
            st.error(f"Error: {str(e)}")


if __name__ == "__main__":
    main()
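
# Sanity check for cosine_similarity (not part of the app; a minimal sketch
# you can paste into a REPL after importing this module). Counter("aab") is
# {'a': 2, 'b': 1} and Counter("abb") is {'a': 1, 'b': 2}, so the dot product
# is 2*1 + 1*2 = 4 and each norm is sqrt(5), giving 4/5 = 0.8:
#
#     >>> from collections import Counter
#     >>> round(cosine_similarity(Counter("aab"), Counter("abb")), 3)
#     0.8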