awacke1 committed on
Commit
d3641f1
·
verified ·
1 Parent(s): 0f95949

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -45
app.py CHANGED
@@ -299,55 +299,46 @@ def save_full_transcript(query, text):
299
  """Save full transcript of Arxiv results as a file."""
300
  create_file(query, text, "md")
301
 
302
- # ------------------------------
303
- # NEW: Helper to parse references
304
- # ------------------------------
305
  def parse_arxiv_refs(ref_text: str):
306
  """
307
- Parse the multi-line references returned by the RAG pipeline.
308
- Typical format lines like:
309
- 1) [Paper Title 2023] This is the summary ...
310
- 2) [Another Title (2024)] Another summary text ...
311
- We'll attempt to find a year with a small regex or fallback.
312
- Return list of dicts: { 'title': str, 'summary': str, 'year': int or None }
313
  """
314
- lines = ref_text.split('\n')
 
 
 
315
  results = []
316
- for line in lines:
317
- line = line.strip()
318
- if not line:
319
- continue
320
- # Attempt to find [Title ...]
321
- title_match = re.search(r"\[([^\]]+)\]", line)
322
- if title_match:
323
- raw_title = title_match.group(1).strip()
324
- else:
325
- # If no bracket found, skip or treat entire line as summary
326
- raw_title = "No Title"
327
-
328
- # Attempt to find trailing summary after bracket
329
- # Example line: " [Paper Title 2024] Paper summary blah blah"
330
- # So remove the bracketed portion from the line
331
- remainder = line.replace(title_match.group(0), "").strip() if title_match else line
332
- summary = remainder
333
-
334
- # Attempt to guess year from the raw title
335
- # We look for 4-digit patterns in raw_title or summary
336
- year_match = re.search(r'(20\d{2})', raw_title)
337
- if not year_match:
338
- # fallback: try summary
339
- year_match = re.search(r'(20\d{2})', summary)
340
- if year_match:
341
- year = int(year_match.group(1))
342
- else:
343
- year = None
344
-
345
- results.append({
346
- 'title': raw_title,
347
- 'summary': summary,
348
- 'year': year
349
- })
350
- return results
351
 
352
 
353
  def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
 
299
  """Save full transcript of Arxiv results as a file."""
300
  create_file(query, text, "md")
301
 
 
 
 
302
  def parse_arxiv_refs(ref_text: str):
303
  """
304
+ Parse paper references with format:
305
+ **DATE | TITLE | ⬇️**
306
+ AUTHORS
307
+ SUMMARY
308
+
309
+ Returns list of dicts with paper details, limited to 20 papers.
310
  """
311
+ # Split on the paper header pattern
312
+ papers = re.split(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
313
+ headers = re.findall(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
314
+
315
  results = []
316
+ for i, (header, content) in enumerate(zip(headers, papers[1:])):
317
+ if i >= 20: # Limit to 20 papers
318
+ break
319
+
320
+ # Parse header parts
321
+ header_parts = [p.strip() for p in header.strip('*').split('|')]
322
+ if len(header_parts) >= 2:
323
+ date_str = header_parts[0].strip()
324
+ title = header_parts[1].strip()
325
+
326
+ # Parse content into authors and summary
327
+ content_parts = content.strip().split('\n', 1)
328
+ authors = content_parts[0].strip('*') if content_parts else ""
329
+ summary = content_parts[1].strip() if len(content_parts) > 1 else ""
330
+
331
+ # Extract year from date
332
+ year_match = re.search(r'20\d{2}', date_str)
333
+ year = int(year_match.group(0)) if year_match else None
334
+
335
+ results.append({
336
+ 'title': title,
337
+ 'summary': summary,
338
+ 'authors': authors,
339
+ 'year': year,
340
+ 'date': date_str
341
+ })
 
 
 
 
 
 
 
 
 
342
 
343
 
344
  def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,