scholarly360 committed
Commit 082ecbe · verified · 1 Parent(s): 8a9fce6

Create app.py

Files changed (1): app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
import os
import uuid
from pathlib import Path

import streamlit as st
import pandas as pd
import fitz  # PyMuPDF, used to read the uploaded PDFs
import chromadb
from annotated_text import annotated_text, annotation
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

st.title("Contracts Classification")

def util_upload_file_and_return_list_docs(uploaded_files):
    """Save each uploaded file to the working directory and open it with PyMuPDF."""
    list_docs = []
    list_save_path = []
    for uploaded_file in uploaded_files:
        save_path = Path(os.getcwd(), uploaded_file.name)
        with open(save_path, mode='wb') as w:
            w.write(uploaded_file.getvalue())
        docs = fitz.open(save_path)
        list_docs.append(docs)
        list_save_path.append(save_path)
    return (list_docs, list_save_path)

#### Helper function to split text using a rolling character window
#### (recommendation: use a smaller sliding size so windows overlap)
def split_txt_file_synthetic_sentence_rolling(ctxt, sentence_size_in_chars, sliding_size_in_chars, debug=False):
    # After this reassignment, sliding_size_in_chars holds the overlap between
    # consecutive windows (window size minus stride).
    sliding_size_in_chars = sentence_size_in_chars - sliding_size_in_chars
    pos_start = 0
    pos_end = len(ctxt)
    final_return = []
    if debug:
        print('pos_start : ', pos_start)
        print('pos_end : ', pos_end)
    # Text shorter than one window fits in a single section.
    if pos_end < sentence_size_in_chars:
        return [{'section_org_text': ctxt[pos_start:pos_end],
                 'section_char_start': pos_start,
                 'section_char_end': pos_end}]
    # Overlap larger than the window means the requested stride was negative.
    if sentence_size_in_chars < sliding_size_in_chars:
        return None
    #### First window
    stop_condition = False
    start = pos_start
    end = start + sentence_size_in_chars
    final_return.append({'section_org_text': ctxt[start:end],
                         'section_char_start': start,
                         'section_char_end': end})
    #### Subsequent windows: each starts `overlap` characters before the previous end
    while not stop_condition:
        start = end - sliding_size_in_chars
        end = start + sentence_size_in_chars
        if end > pos_end:
            if start < pos_end:
                # Final, possibly shorter, section.
                end = pos_end
                final_return.append({'section_org_text': ctxt[start:end],
                                     'section_char_start': start,
                                     'section_char_end': end})
            stop_condition = True
        else:
            final_return.append({'section_org_text': ctxt[start:end],
                                 'section_char_start': start,
                                 'section_char_end': end})
        if debug:
            print('start : ', start)
            print('end : ', end)
    return final_return

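# A minimal illustration of the splitter's behaviour; the window and stride
# values here are made up for the example, not taken from the app:
#
#   split_txt_file_synthetic_sentence_rolling("abcdefghij", 4, 2)
#   # -> sections 'abcd', 'cdef', 'efgh', 'ghij', 'ij'
#   #    (each new window starts 2 characters after the previous one)
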
def util_get_list_page_and_passage(list_docs, list_save_path):
    """Extract each page's text and split it into overlapping passage Documents."""
    passage_documents = []
    for ind_doc, docs in enumerate(list_docs):
        for txt_index, txt_page in enumerate(docs):
            page_document = txt_page.get_text()  # plain text (UTF-8)
            # Split the page with the rolling-window helper above; the window
            # and stride sizes below are assumptions, tune them as needed.
            sections = split_txt_file_synthetic_sentence_rolling(page_document, 1000, 750)
            for sub_sub_index, sub_sub_item in enumerate(sections):
                sub_text = sub_sub_item['section_org_text']
                passage_document = Document(page_content=sub_text,
                                            metadata={"page_content": page_document,
                                                      "page_index": txt_index,
                                                      "file_name": str(list_save_path[ind_doc])})
                passage_documents.append(passage_document)
    return (passage_documents)

# def util_index_chromadb_passages():
#     ##### PROCESSING
#     # create client and a new collection
#     collection_name = str(uuid.uuid4().hex)
#     chroma_client = chromadb.EphemeralClient()
#     chroma_collection = chroma_client.get_or_create_collection(collection_name)
#     # define embedding function
#     embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name="BAAI/bge-small-en"))
#     vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
#     return (chroma_client, chroma_collection, collection_name, vector_store, embed_model)

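# A minimal sketch of how the otherwise-unused BM25/Chroma/ensemble imports
# above could be wired together for hybrid retrieval; the function name, the
# default k, and the 50/50 weights are assumptions, not part of this commit:
def util_build_ensemble_retriever(passage_documents, k=5):
    # Keyword retriever over the raw passage text.
    bm25_retriever = BM25Retriever.from_documents(passage_documents)
    bm25_retriever.k = k
    # Dense retriever backed by an in-memory Chroma index.
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
    vectorstore = Chroma.from_documents(passage_documents, embeddings)
    dense_retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    # Blend keyword and dense rankings with equal weight.
    return EnsembleRetriever(retrievers=[bm25_retriever, dense_retriever],
                             weights=[0.5, 0.5])
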
def util_get_only_content_inside_loop(page_no, page_documents):
    """Return the content of the first passage whose page index matches page_no."""
    for index, item in enumerate(page_documents):
        if item.metadata['page_index'] == page_no:
            return item.page_content
    return None

passage_documents = []

with st.form("my_form"):
    multi = '''1. Download and upload multiple contracts

e.g. https://www.barc.gov.in/tenders/GCC-LPS.pdf

e.g. https://www.montrosecounty.net/DocumentCenter/View/823/Sample-Construction-Contract
'''
    st.markdown(multi)
    multi = '''2. Insert a query to search with, or to find similar language.'''
    st.markdown(multi)
    multi = '''3. Press Index and Calculate.'''
    st.markdown(multi)
    multi = '''
**An attempt is made to retrieve the appropriate page and passage.**
'''
    st.markdown(multi)
    #uploaded_file = st.file_uploader("Choose a file")

    list_docs = []
    list_save_path = []
    uploaded_files = st.file_uploader("Choose file(s)", accept_multiple_files=True)
    submitted = st.form_submit_button("Index and Calculate")

    # With accept_multiple_files=True the uploader returns a (possibly empty)
    # list, so check for truthiness rather than comparing against None.
    if submitted and uploaded_files:
        list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
        passage_documents = util_get_list_page_and_passage(list_docs, list_save_path)
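        # A minimal sketch (an assumption, not part of this commit) of how the
        # indexed passages could answer the query from step 2; the sample query
        # string is a placeholder:
        # ensemble_retriever = util_build_ensemble_retriever(passage_documents)
        # for hit in ensemble_retriever.get_relevant_documents("indemnification clause"):
        #     st.write(hit.metadata["file_name"], "page", hit.metadata["page_index"])
        #     annotated_text(annotation(hit.page_content, "passage"))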