import streamlit as st
from docx import Document
import re
from collections import Counter
from math import sqrt
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
import spacy
import matplotlib.pyplot as plt
# Load spaCy model for semantic analysis
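# en_core_web_md ships with word vectors (needed for Doc.similarity);
# install it first with: python -m spacy download en_core_web_md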
nlp = spacy.load("en_core_web_md")
# Download the NLTK data needed for tokenizing, stopwords, WordNet, and POS tagging
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
# Newer NLTK releases (3.8.2+) split some of this data into separate
# packages; fetching them as well keeps the app working across versions.
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
def read_file_content(uploaded_file):
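    """Return the text content of an uploaded .txt or .docx file."""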
if uploaded_file.type == "text/plain":
return uploaded_file.getvalue().decode("utf-8")
elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
doc = Document(uploaded_file)
return " ".join([paragraph.text for paragraph in doc.paragraphs])
else:
raise ValueError("Unsupported file type")
def preprocess_text(text):
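    """Lowercase, strip punctuation, tokenize, and drop English stopwords."""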
text = re.sub(r'[^\w\s]', '', text.lower())
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(text)
return [word for word in tokens if word not in stop_words]
def cosine_similarity(vec1, vec2):
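    """Cosine similarity between two sparse term-frequency vectors (Counters)."""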
intersection = set(vec1.keys()) & set(vec2.keys())
numerator = sum([vec1[x] * vec2[x] for x in intersection])
sum1 = sum([vec1[x]**2 for x in vec1.keys()])
sum2 = sum([vec2[x]**2 for x in vec2.keys()])
denominator = sqrt(sum1) * sqrt(sum2)
if not denominator:
return 0.0
else:
return float(numerator) / denominator
def calculate_word_similarity(text1, text2):
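    """Bag-of-words cosine similarity between two texts, as a percentage."""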
words1 = preprocess_text(text1)
words2 = preprocess_text(text2)
vec1 = Counter(words1)
vec2 = Counter(words2)
similarity = cosine_similarity(vec1, vec2)
return similarity * 100
def calculate_sentence_similarity(text1, text2):
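    """For each sentence in text1, take its best word-level match in text2,
    then average those best-match scores (a percentage)."""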
sentences1 = sent_tokenize(text1)
sentences2 = sent_tokenize(text2)
    similarities = []
    for sent1 in sentences1:
        # Score each sentence against its best match in the other document.
        best = max((calculate_word_similarity(sent1, sent2) for sent2 in sentences2),
                   default=0.0)
        similarities.append(best)
average_similarity = sum(similarities) / len(similarities) if similarities else 0.0
return average_similarity
def semantic_similarity(text1, text2):
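    """Vector-based similarity of the two documents (a percentage).

    Relies on the spaCy model's word vectors, so a model that includes
    vectors (e.g. en_core_web_md) is required for meaningful scores.
    """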
doc1 = nlp(text1)
doc2 = nlp(text2)
return doc1.similarity(doc2) * 100
def longest_common_subsequence(text1, text2):
    """Longest common subsequence of sentences between the two texts.

    Returns the matched sentences in order. Sentences must match exactly
    to count as common.
    """
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    m, n = len(sentences1), len(sentences2)
    # Standard LCS dynamic program over whole sentences, comparing the
    # sentences at positions i-1 and j-1 for equality.
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if sentences1[i-1] == sentences2[j-1]:
                L[i][j] = L[i-1][j-1] + 1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
    # Backtrack through the table to recover the matched sentences.
    lcs = []
    i, j = m, n
    while i > 0 and j > 0:
        if sentences1[i-1] == sentences2[j-1]:
            lcs.append(sentences2[j-1])
            i -= 1
            j -= 1
        elif L[i-1][j] > L[i][j-1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(lcs))
def suggest_rewrites(sentence):
    """Reword a sentence by swapping content words (nouns, verbs,
    adjectives, adverbs) for their first WordNet synonym."""
    words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    rewrites = []
    for word, tag in tagged_words:
        synonym = word
        if tag[0] in ('N', 'V', 'J', 'R'):  # only reword content words
            syns = wordnet.synsets(word)
            if syns:
                # WordNet lemmas use underscores for multiword entries.
                lemma = syns[0].lemmas()[0].name().replace('_', ' ')
                if lemma.lower() != word.lower():
                    synonym = lemma
        rewrites.append(synonym)
    return " ".join(rewrites)
def create_bar_chart(word_similarity, sentence_similarity, semantic_similarity_score):
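    """Plot the three similarity scores as a bar chart in the Streamlit app."""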
fig, ax = plt.subplots()
ax.bar(["Word-Level Similarity", "Sentence-Level Similarity", "Semantic Similarity"],
[word_similarity, sentence_similarity, semantic_similarity_score], color=["blue", "green", "orange"])
ax.set_ylabel("Percentage")
ax.set_ylim(0, 100)
ax.set_title("Document Similarity")
    st.pyplot(fig)
    plt.close(fig)  # free the figure so reruns don't accumulate memory
def download_report(word_similarity, sentence_similarity, semantic_similarity_score, matched_sequences, reworded_matches):
    """Offer the scores, matches, and rewrite suggestions as a text download."""
    report = f"Word-Level Similarity: {word_similarity:.2f}%\n"
    report += f"Sentence-Level Similarity: {sentence_similarity:.2f}%\n"
    report += f"Semantic Similarity: {semantic_similarity_score:.2f}%\n\n"
    report += "Matched Sequences from the Created Document:\n"
    for i, match in enumerate(matched_sequences, 1):
        report += f"{i}. {match}\n"
    report += "\nRewritten Suggestions to Avoid Plagiarism:\n"
    for i, reworded in enumerate(reworded_matches, 1):
        report += f"{i}. {reworded}\n"
    # Serve the report through Streamlit's built-in download widget.
    st.download_button(
        label="Download Report",
        data=report.encode("utf-8"),
        file_name="plagiarism_report.txt",
        mime="text/plain",
    )
def main():
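    """Streamlit entry point: upload two documents, compare them, and report."""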
st.title("High-Accuracy Document Plagiarism Checker")
doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])
if doc1 is not None and doc2 is not None:
try:
text1 = read_file_content(doc1) # Original Document
text2 = read_file_content(doc2) # Created Document
word_similarity = calculate_word_similarity(text1, text2)
sentence_similarity = calculate_sentence_similarity(text1, text2)
semantic_similarity_score = semantic_similarity(text1, text2)
matched_sequences = longest_common_subsequence(text1, text2)
st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
st.write(f"Semantic Similarity: {semantic_similarity_score:.2f}%")
create_bar_chart(word_similarity, sentence_similarity, semantic_similarity_score)
if word_similarity < 20:
st.write("The created document is mostly original.")
elif word_similarity < 50:
st.write("There are some similarities between the created and original documents.")
else:
st.write("The created document has significant similarities with the original and may contain plagiarism.")
if matched_sequences:
st.subheader("Matched Content from the Created Document:")
for i, match in enumerate(matched_sequences, 1):
st.write(f"{i}. {match}")
# Rewriting the matched content
reworded_matches = [suggest_rewrites(match) for match in matched_sequences]
st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
for i, reworded in enumerate(reworded_matches, 1):
st.write(f"{i}. {reworded}")
download_report(word_similarity, sentence_similarity, semantic_similarity_score, matched_sequences, reworded_matches)
else:
st.write("No significant matched content found from the created document.")
        except Exception as e:
            st.error(f"Error: {e}")
if __name__ == "__main__":
main()
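
# To run this app locally (a minimal sketch of the setup, assuming the
# packages imported above):
#   pip install streamlit python-docx nltk spacy matplotlib
#   python -m spacy download en_core_web_md
#   streamlit run app.py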