File size: 6,562 Bytes
6e04d22 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
import PyPDF2
from docx import Document
from nltk.corpus import wordnet as wn
import nltk
import pandas as pd
# Ensure the required NLTK resources ('wordnet' and 'omw-1.4') are downloaded
# Load the tokenizer and model for sentence embeddings
def load_model():
    """Load the text-to-SQL tokenizer/model and the sentence-embedding model.

    Returns:
        A ``(tokenizer, model, sentence_model)`` tuple on success. If any of
        the loads raises, the error is reported with ``st.error`` and
        ``(None, None, None)`` is returned instead.
    """
    # NOTE(review): nothing here caches the loaded models, so each call loads
    # them again; consider Streamlit resource caching -- confirm the installed
    # Streamlit version supports it before adding.
    model_name = "rakeshkiriyath/gpt2Medium_text_to_sql"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        # Smaller, faster sentence embeddings model
        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    except Exception as e:  # UI boundary: surface the failure instead of crashing
        st.error(f"Error loading models: {e}")
        return None, None, None
    else:
        st.success("Model loaded successfully!")
        return tokenizer, model, sentence_model
# Extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source; the
            caller in this file passes a Streamlit uploaded file.

    Returns:
        The page texts joined in page order. If reading fails, the error is
        reported with ``st.error`` and ``""`` is returned.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Guard against a falsy (None/empty) per-page result so one page
        # without extractable text does not discard the whole document.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:  # UI boundary: report and fall back to empty text
        st.error(f"Error reading PDF: {e}")
        return ""
# Extract text from a Word document
def extract_text_from_word(docx_file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        docx_file: Whatever ``docx.Document`` accepts as its source; the
            caller in this file passes a Streamlit uploaded file.

    Returns:
        Every paragraph's text followed by a newline, in document order. If
        reading fails, the error is reported with ``st.error`` and ``""`` is
        returned.
    """
    try:
        doc = Document(docx_file)
        # Each paragraph keeps its trailing newline, including the last one.
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
    except Exception as e:  # UI boundary: report and fall back to empty text
        st.error(f"Error reading Word document: {e}")
        return ""
# Optimized comparison using embeddings and matrix operations
def compare_sentences(doc1_sentences, doc2_sentences, sentence_model, threshold=0.6):
    """Find sentence pairs across two documents whose embeddings are similar.

    Args:
        doc1_sentences: Sentences of the first document.
        doc2_sentences: Sentences of the second document.
        sentence_model: Object exposing ``encode(sentences,
            convert_to_tensor=True, batch_size=...)``.
        threshold: Minimum cosine similarity for a pair to be reported.
            Raise it for stricter matching, lower it for looser matching.

    Returns:
        A list of ``(i, j, score, sentence1, sentence2)`` tuples, where ``i``
        and ``j`` are zero-based indices into the two sentence lists and
        ``score`` is the similarity as a Python float. Pairs are ordered by
        ``i`` and then ``j``. Empty if either list is empty.
    """
    # Nothing to compare; also avoids handing the encoder an empty batch.
    if not doc1_sentences or not doc2_sentences:
        return []

    # Encode all sentences in batches to get embeddings
    doc1_embeddings = sentence_model.encode(
        doc1_sentences, convert_to_tensor=True, batch_size=16
    )
    doc2_embeddings = sentence_model.encode(
        doc2_sentences, convert_to_tensor=True, batch_size=16
    )

    # Cosine similarity between every doc1/doc2 sentence pair
    similarity_matrix = util.pytorch_cos_sim(doc1_embeddings, doc2_embeddings)

    similar_sentences = []
    for i, row in enumerate(similarity_matrix):
        for j, score in enumerate(row):
            if score >= threshold:
                similar_sentences.append(
                    (i, j, score.item(), doc1_sentences[i], doc2_sentences[j])
                )
    return similar_sentences
# Find similar words or synonyms between two sentences
def find_similar_words(sentence1, sentence2):
    """Pair up words from two sentences that are identical or synonyms.

    Words are the whitespace-separated tokens of each sentence; matching is
    exact (case and punctuation are not normalized). Repeated words are
    considered once.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        A list of ``(word1, word2)`` tuples, ordered by each word's first
        appearance in ``sentence1`` and then in ``sentence2``.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # the result no longer depends on set iteration order.
    words1 = list(dict.fromkeys(sentence1.split()))
    words2 = list(dict.fromkeys(sentence2.split()))

    similar_words = []
    for word1 in words1:
        for word2 in words2:
            # Equality is checked first so identical words skip the synonym lookup.
            if word1 == word2 or is_synonym(word1, word2):
                similar_words.append((word1, word2))
    return similar_words
# Check if two words are synonyms using WordNet
def _lemma_names(word):
    """Collect the lemma names of every WordNet synset found for ``word``."""
    return {lemma.name() for synset in wn.synsets(word) for lemma in synset.lemmas()}


def is_synonym(word1, word2):
    """Report whether two words share at least one WordNet lemma name.

    Args:
        word1: First word to look up.
        word2: Second word to look up.

    Returns:
        True if the lemma-name sets of the two words overlap, False otherwise
        (including when either word has no synsets).
    """
    return not _lemma_names(word1).isdisjoint(_lemma_names(word2))
# Streamlit UI
def _read_uploaded_document(uploaded_file):
    """Extract text from an uploaded file, choosing the parser by extension."""
    if uploaded_file.name.lower().endswith(".pdf"):
        return extract_text_from_pdf(uploaded_file)
    return extract_text_from_word(uploaded_file)


def _split_sentences(text, limit=50):
    """Split text on '. ', drop blank fragments, and keep at most ``limit``."""
    sentences = [part.strip() for part in text.split('. ') if part.strip()]
    return sentences[:limit]


def main():
    """Run the Streamlit page that compares two uploaded documents.

    The page asks for two PDF/Word uploads, previews their text, compares
    their sentences by embedding similarity, and shows a table of matching
    sentence pairs together with their shared words or synonyms. It stops
    early, with an error message, if a document yields no text or the
    sentence model cannot be loaded.
    """
    st.title("Enhanced Comparative Analysis of Two Documents")
    st.sidebar.header("Upload Files")

    # Upload files
    uploaded_file1 = st.sidebar.file_uploader(
        "Upload the First Document (PDF/Word)", type=["pdf", "docx"]
    )
    uploaded_file2 = st.sidebar.file_uploader(
        "Upload the Second Document (PDF/Word)", type=["pdf", "docx"]
    )

    if not (uploaded_file1 and uploaded_file2):
        st.warning("Please upload two documents to compare.")
        return

    # Use the parser that matches each file's type instead of running both
    # and letting the second result overwrite the first.
    text1 = _read_uploaded_document(uploaded_file1)
    text2 = _read_uploaded_document(uploaded_file2)

    if not text1.strip():
        st.error("The first document is empty or could not be read.")
        return
    if not text2.strip():
        st.error("The second document is empty or could not be read.")
        return

    st.write("### Preview of Document 1:")
    st.text(text1[:500])  # Display a preview of Document 1
    st.write("### Preview of Document 2:")
    st.text(text2[:500])  # Display a preview of Document 2

    # Split text into sentences; capped at 50 per document for testing
    # purposes (raise or remove the limit for full processing).
    doc1_sentences = _split_sentences(text1)
    doc2_sentences = _split_sentences(text2)

    # Only the sentence model is needed here; the other two values are unused.
    _tokenizer, _model, sentence_model = load_model()
    if not sentence_model:
        st.error("Failed to load the sentence embedding model.")
        return

    # Perform sentence comparison
    st.info("Comparing sentences, this may take a moment...")
    similar_sentences = compare_sentences(doc1_sentences, doc2_sentences, sentence_model)

    # Display results
    st.header("Comparative Analysis Results")
    st.write(f"Number of sentences in Document 1: {len(doc1_sentences)}")
    st.write(f"Number of sentences in Document 2: {len(doc2_sentences)}")

    if not similar_sentences:
        st.info("No significantly similar sentences found.")
        return

    st.success(f"Found {len(similar_sentences)} similar sentences!")

    # One table row per matching sentence pair, with 1-based sentence labels
    table_data = []
    for doc1_index, doc2_index, score, sent1, sent2 in similar_sentences:
        similar_words = find_similar_words(sent1, sent2)
        similar_words_str = ", ".join(f"({w1}, {w2})" for w1, w2 in similar_words)
        table_data.append(
            [
                f"Sentence {doc1_index + 1}",
                f"Sentence {doc2_index + 1}",
                score,
                similar_words_str,
            ]
        )

    comparison_df = pd.DataFrame(
        table_data,
        columns=[
            "Document 1 Sentence",
            "Document 2 Sentence",
            "Similarity Score",
            "Similar Words/Synonyms",
        ],
    )
    # Render the table; without this the DataFrame is built but never shown.
    st.dataframe(comparison_df)
if __name__ == "__main__":