"""High-accuracy document plagiarism checker (Streamlit app).

Setup (assumed environment):
    pip install streamlit python-docx nltk spacy matplotlib
    python -m spacy download en_core_web_md

Run with:
    streamlit run plagiarism_checker.py
"""

import base64
import re
from collections import Counter
from math import sqrt

import matplotlib.pyplot as plt
import nltk
import spacy
import streamlit as st
from docx import Document
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize

# Load the medium spaCy model; it ships with word vectors, which
# Doc.similarity() needs to produce meaningful scores.
nlp = spacy.load("en_core_web_md")

# Download the NLTK data used below (no-ops if already present).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)


def read_file_content(uploaded_file):
    """Return the plain text of an uploaded .txt or .docx file."""
    if uploaded_file.type == "text/plain":
        return uploaded_file.getvalue().decode("utf-8")
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(uploaded_file)
        return " ".join(paragraph.text for paragraph in doc.paragraphs)
    else:
        raise ValueError("Unsupported file type")


def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize, and drop English stopwords."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]


def cosine_similarity(vec1, vec2):
    """Cosine similarity between two term-frequency Counters."""
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    sum1 = sum(v ** 2 for v in vec1.values())
    sum2 = sum(v ** 2 for v in vec2.values())
    denominator = sqrt(sum1) * sqrt(sum2)
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def calculate_word_similarity(text1, text2):
    """Word-level cosine similarity between two texts, as a percentage."""
    vec1 = Counter(preprocess_text(text1))
    vec2 = Counter(preprocess_text(text2))
    return cosine_similarity(vec1, vec2) * 100


def calculate_sentence_similarity(text1, text2):
    """For each sentence in text1, find its best match in text2 and
    return the average of those best-match scores, as a percentage."""
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    similarities = []
    for sent1 in sentences1:
        max_similarity = 0.0
        for sent2 in sentences2:
            similarity = calculate_word_similarity(sent1, sent2)
            if similarity > max_similarity:
                max_similarity = similarity
        similarities.append(max_similarity)
    return sum(similarities) / len(similarities) if similarities else 0.0


def semantic_similarity(text1, text2):
    """Vector-based semantic similarity via spaCy, as a percentage."""
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2) * 100


def longest_common_subsequence(text1, text2):
    """Return the longest common subsequence of sentences between the texts."""
    sentences1 = sent_tokenize(text1)
    sentences2 = sent_tokenize(text2)
    m, n = len(sentences1), len(sentences2)
    # Standard LCS dynamic program: L[i][j] is the LCS length of the
    # first i sentences of text1 and the first j sentences of text2.
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if sentences1[i - 1] == sentences2[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    # Backtrack through the table to recover the matched sentences.
    lcs = []
    i, j = m, n
    while i > 0 and j > 0:
        if sentences1[i - 1] == sentences2[j - 1]:
            lcs.append(sentences2[j - 1])
            i -= 1
            j -= 1
        elif L[i - 1][j] > L[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(lcs))


def suggest_rewrites(sentence):
    """Suggest a paraphrase by swapping content words (nouns, verbs,
    adjectives, adverbs) for their first WordNet synonym."""
    words = word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    rewrites = []
    for word, tag in tagged_words:
        syns = wordnet.synsets(word)
        if syns and tag.startswith(('N', 'V', 'J', 'R')):
            # WordNet joins multi-word lemmas with underscores.
            synonym = syns[0].lemmas()[0].name().replace('_', ' ')
            rewrites.append(synonym if synonym != word else word)
        else:
            rewrites.append(word)
    return " ".join(rewrites)
def create_bar_chart(word_similarity, sentence_similarity, semantic_similarity_score):
    """Render the three similarity scores as a bar chart."""
    fig, ax = plt.subplots()
    ax.bar(
        ["Word-Level Similarity", "Sentence-Level Similarity", "Semantic Similarity"],
        [word_similarity, sentence_similarity, semantic_similarity_score],
        color=["blue", "green", "orange"],
    )
    ax.set_ylabel("Percentage")
    ax.set_ylim(0, 100)
    ax.set_title("Document Similarity")
    st.pyplot(fig)


def download_report(word_similarity, sentence_similarity, semantic_similarity_score,
                    matched_sequences, reworded_matches):
    """Offer the results as a downloadable plain-text report."""
    report = f"Word-Level Similarity: {word_similarity:.2f}%\n"
    report += f"Sentence-Level Similarity: {sentence_similarity:.2f}%\n"
    report += f"Semantic Similarity: {semantic_similarity_score:.2f}%\n\n"
    report += "Matched Sequences from the Created Document:\n"
    for i, match in enumerate(matched_sequences, 1):
        report += f"{i}. {match}\n"
    report += "\nRewritten Suggestions to Avoid Plagiarism:\n"
    for i, reworded in enumerate(reworded_matches, 1):
        report += f"{i}. {reworded}\n"
    report_bytes = report.encode("utf-8")
    b64 = base64.b64encode(report_bytes).decode()
    # Embed the report as a base64 data URI so it can be downloaded
    # directly from the page.
    href = f'<a href="data:file/txt;base64,{b64}" download="plagiarism_report.txt">Download Report</a>'
    st.markdown(href, unsafe_allow_html=True)


def main():
    st.title("High-Accuracy Document Plagiarism Checker")
    doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
    doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])
    if doc1 is not None and doc2 is not None:
        try:
            text1 = read_file_content(doc1)  # Original document
            text2 = read_file_content(doc2)  # Created document

            word_similarity = calculate_word_similarity(text1, text2)
            sentence_similarity = calculate_sentence_similarity(text1, text2)
            semantic_similarity_score = semantic_similarity(text1, text2)
            matched_sequences = longest_common_subsequence(text1, text2)

            st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
            st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
            st.write(f"Semantic Similarity: {semantic_similarity_score:.2f}%")

            create_bar_chart(word_similarity, sentence_similarity, semantic_similarity_score)

            if word_similarity < 20:
                st.write("The created document is mostly original.")
            elif word_similarity < 50:
                st.write("There are some similarities between the created and original documents.")
            else:
                st.write("The created document has significant similarities with the original and may contain plagiarism.")

            if matched_sequences:
                st.subheader("Matched Content from the Created Document:")
                for i, match in enumerate(matched_sequences, 1):
                    st.write(f"{i}. {match}")

                # Rewrite the matched content to suggest paraphrases.
                reworded_matches = [suggest_rewrites(match) for match in matched_sequences]
                st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
                for i, reworded in enumerate(reworded_matches, 1):
                    st.write(f"{i}. {reworded}")

                download_report(word_similarity, sentence_similarity, semantic_similarity_score,
                                matched_sequences, reworded_matches)
            else:
                st.write("No significant matched content found from the created document.")
        except ValueError as e:
            st.error(f"Error: {str(e)}")


if __name__ == "__main__":
    main()
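
# Sanity check for cosine_similarity (not part of the app; a minimal sketch
# you can paste into a REPL after importing this module). Counter("aab") is
# {'a': 2, 'b': 1} and Counter("abb") is {'a': 1, 'b': 2}, so the dot product
# is 2*1 + 1*2 = 4 and each norm is sqrt(5), giving 4/5 = 0.8:
#
#     >>> from collections import Counter
#     >>> round(cosine_similarity(Counter("aab"), Counter("abb")), 3)
#     0.8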