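"""Streamlit app for comparative analysis of two documents.

Extracts text from two uploaded PDF/Word files, embeds their sentences with
a sentence-transformers model, pairs up sentences whose cosine similarity
clears a threshold, and lists shared words/WordNet synonyms for each pair.
"""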
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
import PyPDF2
from docx import Document
from nltk.corpus import wordnet as wn
import nltk
import pandas as pd
import string
from functools import lru_cache

# Ensure required WordNet resources are downloaded (quiet avoids log spam on reruns)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Load the GPT-2 text-to-SQL causal LM (returned but unused by the comparison
# pipeline below) and the sentence-embedding model that drives the comparison
@st.cache_resource
def load_model():
    try:
        tokenizer = AutoTokenizer.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
        model = AutoModelForCausalLM.from_pretrained("rakeshkiriyath/gpt2Medium_text_to_sql")
        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # Smaller, faster sentence embeddings model
        st.success("Model loaded successfully!")
        return tokenizer, model, sentence_model
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None, None

# Extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""
        return text
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""

# Extract text from a Word document
def extract_text_from_word(docx_file):
    try:
        doc = Document(docx_file)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text
    except Exception as e:
        st.error(f"Error reading Word document: {e}")
        return ""

# Optimized comparison using embeddings and matrix operations
def compare_sentences(doc1_sentences, doc2_sentences, sentence_model):
    # Encode all sentences in batches to get embeddings
    doc1_embeddings = sentence_model.encode(doc1_sentences, convert_to_tensor=True, batch_size=16)
    doc2_embeddings = sentence_model.encode(doc2_sentences, convert_to_tensor=True, batch_size=16)

    # Compute cosine similarity matrix between all pairs
    similarity_matrix = util.pytorch_cos_sim(doc1_embeddings, doc2_embeddings)
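    # similarity_matrix[i][j] is the cosine similarity between sentence i of
    # doc1 and sentence j of doc2; shape (len(doc1_sentences), len(doc2_sentences))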

    # Extract pairs with similarity > threshold
    threshold = 0.6  # Adjust this for stricter or looser matching
    similar_sentences = []

    for i, row in enumerate(similarity_matrix):
        for j, score in enumerate(row):
            if score >= threshold:
                similar_sentences.append((i, j, score.item(), doc1_sentences[i], doc2_sentences[j]))

    return similar_sentences
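# Illustrative usage (exact scores depend on the embedding model):
#   compare_sentences(["The cat sat."], ["A cat was sitting."], sentence_model)
#   -> [(0, 0, ~0.8, "The cat sat.", "A cat was sitting.")]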

# Find shared words or WordNet synonyms between two sentences
def find_similar_words(sentence1, sentence2):
    # Lowercase and strip punctuation so surface differences don't hide matches
    words1 = {w.strip(string.punctuation).lower() for w in sentence1.split()} - {""}
    words2 = {w.strip(string.punctuation).lower() for w in sentence2.split()} - {""}
    similar_words = []

    for word1 in words1:
        for word2 in words2:
            if word1 == word2 or is_synonym(word1, word2):
                similar_words.append((word1, word2))

    return similar_words

# Check if two words are synonyms using WordNet (their lemma sets overlap)
@lru_cache(maxsize=None)
def get_lemma_set(word):
    return frozenset(lemma.name() for synset in wn.synsets(word) for lemma in synset.lemmas())

def is_synonym(word1, word2):
    # Cached lemma lookups keep the nested word-pair loops tolerable
    return not get_lemma_set(word1).isdisjoint(get_lemma_set(word2))
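# Example: is_synonym("car", "automobile") is True, since both words share
# the lemmas of WordNet synset car.n.01.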

# Streamlit UI
def main():
    st.title("Enhanced Comparative Analysis of Two Documents")
    st.sidebar.header("Upload Files")

    # Upload files
    uploaded_file1 = st.sidebar.file_uploader("Upload the First Document (PDF/Word)", type=["pdf", "docx"])
    uploaded_file2 = st.sidebar.file_uploader("Upload the Second Document (PDF/Word)", type=["pdf", "docx"])

    if uploaded_file1 and uploaded_file2:
        # Extract text from the uploaded documents
        if uploaded_file1.name.endswith(".pdf"):
            text1 = extract_text_from_pdf(uploaded_file1)
        else:
            text1 = extract_text_from_word(uploaded_file1)

        if uploaded_file2.name.endswith(".pdf"):
            text2 = extract_text_from_pdf(uploaded_file2)
        else:
            text2 = extract_text_from_word(uploaded_file2)

        if not text1.strip():
            st.error("The first document is empty or could not be read.")
            return
        if not text2.strip():
            st.error("The second document is empty or could not be read.")
            return

        st.write("### Preview of Document 1:")
        st.text(text1[:500])  # Display a preview of Document 1
        st.write("### Preview of Document 2:")
        st.text(text2[:500])  # Display a preview of Document 2

        # Split text into sentences (naive split on '. '); drop empty fragments,
        # which would otherwise match each other with similarity 1.0
        doc1_sentences = [s.strip() for s in text1.split('. ') if s.strip()]
        doc2_sentences = [s.strip() for s in text2.split('. ') if s.strip()]

        # Limit sentences for testing purposes (optional)
        doc1_sentences = doc1_sentences[:50]  # Remove this line for full processing
        doc2_sentences = doc2_sentences[:50]  # Remove this line for full processing

        # Load models
        tokenizer, model, sentence_model = load_model()
        if sentence_model is None:
            st.error("Failed to load the sentence embedding model.")
            return

        # Perform sentence comparison
        st.info("Comparing sentences, this may take a moment...")
        similar_sentences = compare_sentences(doc1_sentences, doc2_sentences, sentence_model)

        # Display results
        st.header("Comparative Analysis Results")
        st.write(f"Number of sentences in Document 1: {len(doc1_sentences)}")
        st.write(f"Number of sentences in Document 2: {len(doc2_sentences)}")

        if similar_sentences:
            st.success(f"Found {len(similar_sentences)} similar sentences!")
            
            # Prepare table for similar words
            table_data = []
            for match in similar_sentences:
                doc1_index, doc2_index, score, sent1, sent2 = match
                similar_words = find_similar_words(sent1, sent2)
                similar_words_str = ", ".join([f"({w1}, {w2})" for w1, w2 in similar_words])
                table_data.append([f"Sentence {doc1_index + 1}", f"Sentence {doc2_index + 1}", round(score, 3), similar_words_str])

            # Create a DataFrame for display
            comparison_df = pd.DataFrame(table_data, columns=["Document 1 Sentence", "Document 2 Sentence", "Similarity Score", "Similar Words/Synonyms"])
            st.table(comparison_df)
        else:
            st.info("No significantly similar sentences found.")
    else:
        st.warning("Please upload two documents to compare.")

if __name__ == "__main__":
    main()