ashishanand committed
Commit 026406c · 1 Parent(s): f3f65fe

citation improved

Files changed (1): app.py (+55 -7)
app.py CHANGED

@@ -1,7 +1,7 @@
  # app.py

  import os
- # import re
+ import re
  import torch
  # import pdfplumber
  from chromadb.utils import embedding_functions
@@ -18,6 +18,50 @@ groq_api_key = os.environ.get('GROQ_API_KEY')
  chat_client = Groq(api_key=groq_api_key)
  model = "llama-3.2-90b-text-preview"

+ def edit_text(text):
+     # Find all citations and their positions
+     citation_matches = list(re.finditer(r'\[(\d+)\]', text))
+
+     # List to store indices of citations to remove
+     indices_to_remove = []
+
+     prev_num = None
+     prev_index = None
+
+     # Identify consecutive duplicate citations
+     for i in range(len(citation_matches)):
+         current_citation = citation_matches[i]
+         current_num = current_citation.group(1)
+
+         if prev_num == current_num:
+             # Mark the previous citation for removal
+             indices_to_remove.append(prev_index)
+         prev_num = current_num
+         prev_index = i
+
+     # Reconstruct the text with modifications
+     output_parts = []
+     last_end = 0
+
+     for i in range(len(citation_matches)):
+         m = citation_matches[i]
+         start, end = m.span()
+         if i in indices_to_remove:
+             # Remove citation
+             output_parts.append(text[last_end:start])
+         else:
+             # Keep and modify citation
+             output_parts.append(text[last_end:start])
+             page_num = m.group(1)
+             new_citation = '[Page ' + page_num + ']'
+             output_parts.append(new_citation)
+         last_end = end
+
+     # Append any remaining text after the last citation
+     output_parts.append(text[last_end:])
+
+     modified_text = ''.join(output_parts)
+     return modified_text

  # def parse_pdf(pdf_path):

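A quick sanity check of the new helper (the sample string below is invented, and importing app assumes GROQ_API_KEY is set, since the Groq client is constructed at module import):

    from app import edit_text

    raw = "Check tire pressure [4][4] and torque the axle nuts [7]."
    print(edit_text(raw))
    # -> Check tire pressure [Page 4] and torque the axle nuts [Page 7].

Note that "consecutive" means consecutive in the match sequence, not adjacent in the text: in "Use the 5 mm hex key [3]. Tighten gently [3]." the first [3] is also dropped, leaving the earlier sentence without its marker (and a stray space). The commit evidently accepts that trade-off to keep repeated page references from piling up.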
 
@@ -153,9 +197,11 @@ def process_query(query):
          return "No relevant information found in the manual."

      # Extract chunks and metadata
-     chunks = results['documents'][0]
+     pre_chunks = results['documents'][0]
      metadatas = results['metadatas'][0]

+     chunks = [f'Page {y["page_number"]}:: {x}' for x,y in zip(pre_chunks,metadatas)]
+
      reranked_chunks = colbert_rerank(query, chunks)
      final_context = " ".join(reranked_chunks[:10])
 
@@ -168,14 +214,16 @@ def process_query(query):
      if last_complete != -1:
          answer = answer[:last_complete + 1].strip()

+     answer = edit_text(answer)
      # Prepare citations
-     citations = [
-         f"Page {meta.get('page_number', 'N/A')}" for meta in metadatas[:5]
-     ]
+     # citations = [
+     #     f"Page {meta.get('page_number', 'N/A')}" for meta in metadatas[:5]
+     # ]

-     citations_text = "Pages cited from:\n" + "\n".join(citations)
+     # citations_text = "Pages cited from:\n" + "\n".join(citations)

-     return f"{answer}\n\n{citations_text}"
+     # return f"{answer}\n\n{citations_text}"
+     return answer

  # Initialize global variables
  def initialize():
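Net effect on the return value: the old code appended a "Pages cited from:" block built from the top five retrieval metadatas, whether or not the answer actually drew on those chunks (and with possible repeats); the citations now sit inline in the answer itself. Roughly, with both strings hypothetical:

    before = ("Inflate the tires to 60 psi.\n\n"
              "Pages cited from:\nPage 14\nPage 14\nPage 31")
    after = "Inflate the tires to 60 psi. [Page 14]"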