import base64
import re
from collections import Counter
from math import sqrt

import matplotlib.pyplot as plt
import nltk
import spacy
import streamlit as st
from docx import Document
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize

# The medium English model ships with word vectors, which Doc.similarity
# relies on; the small model (en_core_web_sm) has none, and spaCy warns
# that similarity scores computed without them are unreliable.
nlp = spacy.load("en_core_web_md")

# Fetch the NLTK data used below: the Punkt sentence tokenizer, the English
# stopword list, WordNet (for synonym lookups), and the POS tagger used by
# suggest_rewrites().
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)


def read_file_content(uploaded_file):
    """Return the text content of an uploaded .txt or .docx file."""
    if uploaded_file.type == "text/plain":
        return uploaded_file.getvalue().decode("utf-8")
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(uploaded_file)
        return " ".join(paragraph.text for paragraph in doc.paragraphs)
    else:
        raise ValueError("Unsupported file type")


def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize, and drop English stopwords."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]
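
# Illustrative example (punctuation and stopwords are removed):
#   preprocess_text("The cats sat on the mat!")  ->  ['cats', 'sat', 'mat']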


def cosine_similarity(vec1, vec2):
    """Cosine similarity between two sparse term-frequency vectors (Counters)."""
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    sum1 = sum(count ** 2 for count in vec1.values())
    sum2 = sum(count ** 2 for count in vec2.values())
    denominator = sqrt(sum1) * sqrt(sum2)
    if not denominator:
        return 0.0
    return float(numerator) / denominator
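
# This computes cos(v1, v2) = sum(v1[x] * v2[x]) / (||v1|| * ||v2||).
# Worked example: Counter({'a': 1, 'b': 2}) vs. Counter({'a': 1, 'b': 1})
# gives 3 / (sqrt(5) * sqrt(2)) ~= 0.949.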


def calculate_word_similarity(text1, text2):
    """Bag-of-words cosine similarity between two texts, as a percentage."""
    vec1 = Counter(preprocess_text(text1))
    vec2 = Counter(preprocess_text(text2))
    return cosine_similarity(vec1, vec2) * 100
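
# Note that word order is ignored: for instance,
#   calculate_word_similarity("dogs chase cats", "cats chase dogs")
# returns 100.0, since both texts reduce to the same bag of words.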


def calculate_sentence_similarity(text1, text2):
    """Average, over the sentences of text1, of each sentence's best
    word-level match among the sentences of text2, as a percentage."""
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    similarities = []
    for sent1 in sentences1:
        best_match = max(
            (calculate_word_similarity(sent1, sent2) for sent2 in sentences2),
            default=0.0,
        )
        similarities.append(best_match)
    return sum(similarities) / len(similarities) if similarities else 0.0
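
# This measure is asymmetric: it scans text1's sentences against text2's,
# at a cost of one cosine comparison per sentence pair.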


def semantic_similarity(text1, text2):
    """spaCy vector similarity between the two documents, as a percentage."""
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2) * 100
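
# Doc.similarity is the cosine of the two documents' averaged word vectors,
# so it captures topical overlap even when the wording differs; heavily
# paraphrased text can still score high here.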


def longest_common_subsequence(text1, text2):
    """Longest common subsequence of sentences shared by the two texts."""
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    m, n = len(sentences1), len(sentences2)
    # Standard LCS dynamic program: L[i][j] is the LCS length of the first
    # i sentences of text1 and the first j sentences of text2.
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if sentences1[i-1] == sentences2[j-1]:
                L[i][j] = L[i-1][j-1] + 1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
    # Backtrack through the table to recover the matched sentences.
    lcs = []
    i, j = m, n
    while i > 0 and j > 0:
        if sentences1[i-1] == sentences2[j-1]:
            lcs.append(sentences2[j-1])
            i -= 1
            j -= 1
        elif L[i-1][j] > L[i][j-1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(lcs))
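
# Worked example: if the original tokenizes to sentences [A, B, C] and the
# created document to [A, C, D], the LCS is [A, C]: sentences copied in the
# same order, even with unrelated text in between.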


def suggest_rewrites(sentence):
    """Suggest a reworded sentence by swapping content words for synonyms.

    Only nouns, verbs, adjectives, and adverbs are touched, and only when
    WordNet's first synset offers a lemma different from the original word.
    """
    words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    rewrites = []
    for word, tag in tagged_words:
        replacement = word
        if tag.startswith(('N', 'V', 'J', 'R')):
            syns = wordnet.synsets(word)
            if syns:
                # WordNet joins multiword lemmas with underscores.
                candidate = syns[0].lemmas()[0].name().replace('_', ' ')
                if candidate.lower() != word.lower():
                    replacement = candidate
        rewrites.append(replacement)
    return " ".join(rewrites)
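
# Illustrative only: the replacement is whatever lemma WordNet happens to
# list first, so suggest_rewrites("The attorney made a statement") might
# come back as "The lawyer made a statement", but the output is not
# guaranteed to be grammatical or to preserve meaning.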


def create_bar_chart(word_similarity, sentence_similarity, semantic_similarity_score):
    fig, ax = plt.subplots()
    ax.bar(
        ["Word-Level Similarity", "Sentence-Level Similarity", "Semantic Similarity"],
        [word_similarity, sentence_similarity, semantic_similarity_score],
        color=["blue", "green", "orange"],
    )
    ax.set_ylabel("Percentage")
    ax.set_ylim(0, 100)
    ax.set_title("Document Similarity")
    st.pyplot(fig)
    plt.close(fig)  # Release the figure so Streamlit reruns don't leak memory.


def download_report(word_similarity, sentence_similarity, semantic_similarity_score, matched_sequences, reworded_matches):
    """Render a download link for a plain-text report of the results."""
    report = f"Word-Level Similarity: {word_similarity:.2f}%\n"
    report += f"Sentence-Level Similarity: {sentence_similarity:.2f}%\n"
    report += f"Semantic Similarity: {semantic_similarity_score:.2f}%\n\n"
    report += "Matched Sequences from the Created Document:\n"
    for i, match in enumerate(matched_sequences, 1):
        report += f"{i}. {match}\n"

    report += "\nRewritten Suggestions to Avoid Plagiarism:\n"
    for i, reworded in enumerate(reworded_matches, 1):
        report += f"{i}. {reworded}\n"

    # Serve the report as a base64-encoded data URI inside an anchor tag.
    report_bytes = report.encode("utf-8")
    b64 = base64.b64encode(report_bytes).decode()
    href = f'<a href="data:text/plain;base64,{b64}" download="plagiarism_report.txt">Download Report</a>'
    st.markdown(href, unsafe_allow_html=True)
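
# Note: if your Streamlit version provides st.download_button, it is a
# simpler alternative to this hand-rolled base64 data-URI link and avoids
# unsafe_allow_html.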


def main():
    st.title("High-Accuracy Document Plagiarism Checker")

    doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
    doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])

    if doc1 is not None and doc2 is not None:
        try:
            text1 = read_file_content(doc1)
            text2 = read_file_content(doc2)

            word_similarity = calculate_word_similarity(text1, text2)
            sentence_similarity = calculate_sentence_similarity(text1, text2)
            semantic_similarity_score = semantic_similarity(text1, text2)

            matched_sequences = longest_common_subsequence(text1, text2)

            st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
            st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
            st.write(f"Semantic Similarity: {semantic_similarity_score:.2f}%")

            create_bar_chart(word_similarity, sentence_similarity, semantic_similarity_score)

            if word_similarity < 20:
                st.write("The created document is mostly original.")
            elif word_similarity < 50:
                st.write("There are some similarities between the created and original documents.")
            else:
                st.write("The created document has significant similarities with the original and may contain plagiarism.")

            if matched_sequences:
                st.subheader("Matched Content from the Created Document:")
                for i, match in enumerate(matched_sequences, 1):
                    st.write(f"{i}. {match}")

                reworded_matches = [suggest_rewrites(match) for match in matched_sequences]

                st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
                for i, reworded in enumerate(reworded_matches, 1):
                    st.write(f"{i}. {reworded}")

                download_report(word_similarity, sentence_similarity, semantic_similarity_score, matched_sequences, reworded_matches)
            else:
                st.write("No significant matched content found from the created document.")

        except ValueError as e:
            st.error(f"Error: {str(e)}")


if __name__ == "__main__":
    main()