awacke1 committed on
Commit
e90888f
·
verified ·
1 Parent(s): ad5d2a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -81
app.py CHANGED
@@ -301,106 +301,143 @@ def save_full_transcript(query, text):
301
 
302
  def parse_arxiv_refs(ref_text: str):
303
  """
304
- Parse paper references with format:
305
- **DATE | TITLE | ⬇️**
306
- AUTHORS
307
- SUMMARY
308
-
309
- Returns list of dicts with paper details, limited to 20 papers.
310
- Returns empty list if parsing fails.
311
  """
312
- try:
313
- if not ref_text:
314
- return []
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
- # Split on the paper header pattern
317
- papers = re.split(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
318
- headers = re.findall(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
319
-
320
- results = []
321
- for i, (header, content) in enumerate(zip(headers, papers[1:])):
322
- if i >= 20: # Limit to 20 papers
323
- break
324
-
325
  try:
326
- # Parse header parts
327
- header_parts = [p.strip() for p in header.strip('*').split('|')]
328
- if len(header_parts) >= 2:
329
- date_str = header_parts[0].strip()
330
- title = header_parts[1].strip()
331
-
332
- # Parse content into authors and summary
333
- content_parts = content.strip().split('\n', 1)
334
- authors = content_parts[0].strip('*') if content_parts else ""
335
- summary = content_parts[1].strip() if len(content_parts) > 1 else ""
336
-
337
- # Extract year from date
338
- year_match = re.search(r'20\d{2}', date_str)
339
- year = int(year_match.group(0)) if year_match else None
340
-
341
- results.append({
342
- 'title': title,
343
- 'summary': summary,
344
- 'authors': authors,
345
- 'year': year,
346
- 'date': date_str
347
- })
348
  except Exception as e:
349
- st.warning(f"Error parsing paper {i+1}: {str(e)}")
 
350
  continue
351
-
352
- return results
353
- except Exception as e:
354
- st.error(f"Error parsing papers: {str(e)}")
355
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
  def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
358
- titles_summary=True, full_audio=False):
359
- """Perform Arxiv search and generate audio summaries."""
360
  start = time.time()
361
 
362
  # Query the HF RAG pipeline
363
  client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
364
- refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
365
- r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
 
 
 
366
 
367
  # Combine for final text output
368
  result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
369
  st.markdown(result)
370
 
371
- # Parse references
372
- parsed_refs = parse_arxiv_refs(refs)
373
-
374
- # Sort only if we have results
375
- if parsed_refs:
376
- parsed_refs.sort(key=lambda x: x.get("year", 0) if x.get("year") else 0, reverse=True)
377
-
378
- # Display papers
379
- st.write("## Research Papers")
380
- for idx, paper in enumerate(parsed_refs):
381
- st.markdown(f"**{paper['date']} | {paper['title']} | ⬇️**")
382
- st.markdown(f"*{paper['authors']}*")
383
- st.markdown(paper['summary'])
384
-
385
- # Audio controls
386
- colA, colB = st.columns(2)
387
- with colA:
388
- if st.button(f"🔊 Title", key=f"title_{idx}"):
389
- text_tts = clean_for_speech(paper['title'])
390
- audio_file_title = speak_with_edge_tts(text_tts)
391
- play_and_download_audio(audio_file_title)
392
-
393
- with colB:
394
- if st.button(f"🔊 Full Details", key=f"summary_{idx}"):
395
- text_tts = clean_for_speech(f"{paper['title']} by {paper['authors']}. {paper['summary']}")
396
- audio_file_title_summary = speak_with_edge_tts(text_tts)
397
- play_and_download_audio(audio_file_title_summary)
398
 
399
- st.write("---")
400
-
401
- # Rest of your existing function...
402
  elapsed = time.time()-start
403
  st.write(f"**Total Elapsed:** {elapsed:.2f} s")
 
 
404
  create_file(q, result, "md")
405
  return result
406
 
 
301
 
302
  def parse_arxiv_refs(ref_text: str):
303
  """
304
+ Parse papers by finding lines with two pipe characters as title lines.
305
+ Returns list of paper dictionaries with audio files.
 
 
 
 
 
306
  """
307
+ if not ref_text:
308
+ return []
309
+
310
+ results = []
311
+ current_paper = {}
312
+ lines = ref_text.split('\n')
313
+
314
+ for i, line in enumerate(lines):
315
+ # Check if this is a title line (contains exactly 2 pipe characters)
316
+ if line.count('|') == 2:
317
+ # If we have a previous paper, add it to results
318
+ if current_paper:
319
+ results.append(current_paper)
320
+ if len(results) >= 20: # Limit to 20 papers
321
+ break
322
 
323
+ # Parse new paper header
 
 
 
 
 
 
 
 
324
  try:
325
+ # Remove ** and split by |
326
+ header_parts = line.strip('* ').split('|')
327
+ date = header_parts[0].strip()
328
+ title = header_parts[1].strip()
329
+ # Extract arXiv URL if present
330
+ url_match = re.search(r'(https://arxiv.org/\S+)', line)
331
+ url = url_match.group(1) if url_match else f"paper_{len(results)}"
332
+
333
+ current_paper = {
334
+ 'date': date,
335
+ 'title': title,
336
+ 'url': url,
337
+ 'authors': '',
338
+ 'summary': '',
339
+ 'content_start': i + 1 # Track where content begins
340
+ }
 
 
 
 
 
 
341
  except Exception as e:
342
+ st.warning(f"Error parsing paper header: {str(e)}")
343
+ current_paper = {}
344
  continue
345
+
346
+ # If we have a current paper and this isn't a title line, add to content
347
+ elif current_paper:
348
+ if not current_paper['authors']: # First line after title is authors
349
+ current_paper['authors'] = line.strip('* ')
350
+ else: # Rest is summary
351
+ if current_paper['summary']:
352
+ current_paper['summary'] += ' ' + line.strip()
353
+ else:
354
+ current_paper['summary'] = line.strip()
355
+
356
+ # Don't forget the last paper
357
+ if current_paper:
358
+ results.append(current_paper)
359
+
360
+ return results[:20] # Ensure we return maximum 20 papers
361
+
362
def create_paper_audio_files(papers):
    """
    Generate speech audio for each paper and store the results on its dict.

    For every paper two clips are requested through ``speak_with_edge_tts``:
    one for the title and one for "title by authors. summary". The values
    returned are stored under ``title_audio`` and ``full_audio``. Both keys
    are always set; a clip that could not be produced is left as ``None``.

    Args:
        papers: Iterable of paper dicts with ``title``, ``authors`` and
            ``summary`` keys. The dicts are modified in place.
    """
    for paper in papers:
        # Default both up front so a failure part-way through keeps whatever
        # was already generated instead of discarding it.
        paper['title_audio'] = None
        paper['full_audio'] = None

        try:
            # Generate audio for title
            title_text = clean_for_speech(paper['title'])
            paper['title_audio'] = speak_with_edge_tts(title_text)

            # Generate audio for full content
            full_text = clean_for_speech(
                f"{paper['title']} by {paper['authors']}. {paper['summary']}"
            )
            paper['full_audio'] = speak_with_edge_tts(full_text)

        except Exception as e:
            # Best effort: one failing paper must not stop the others.
            # .get() keeps the handler itself from raising on a missing title.
            st.warning(f"Error generating audio for paper {paper.get('title', '')}: {str(e)}")
383
+
384
def display_papers(papers):
    """
    Render each paper in an expander together with its audio players.

    Args:
        papers: Iterable of paper dicts with ``title``, ``date``, ``authors``
            and ``summary`` keys. The optional ``title_audio`` and
            ``full_audio`` entries are passed to ``st.audio`` when truthy.
    """
    st.write("## Research Papers")

    for paper in papers:
        with st.expander(f"📄 {paper['title']}", expanded=True):
            st.markdown(f"**{paper['date']} | {paper['title']} | ⬇️**")
            st.markdown(f"*{paper['authors']}*")
            st.markdown(paper['summary'])

            # Audio controls in columns
            col1, col2 = st.columns(2)

            # st.audio takes no `key` argument; passing one raises TypeError,
            # so the players are created without it.
            with col1:
                if paper.get('title_audio'):
                    st.write("🎙️ Title Audio")
                    st.audio(paper['title_audio'])

            with col2:
                if paper.get('full_audio'):
                    st.write("📚 Full Paper Audio")
                    st.audio(paper['full_audio'])
411
 
412
  def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
413
+ titles_summary=True, full_audio=False):
414
+ """Perform Arxiv search with audio generation per paper."""
415
  start = time.time()
416
 
417
  # Query the HF RAG pipeline
418
  client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
419
+ refs = client.predict(q, 20, "Semantic Search",
420
+ "mistralai/Mixtral-8x7B-Instruct-v0.1",
421
+ api_name="/update_with_rag_md")[0]
422
+ r2 = client.predict(q, "mistralai/Mixtral-8x7B-Instruct-v0.1",
423
+ True, api_name="/ask_llm")
424
 
425
  # Combine for final text output
426
  result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
427
  st.markdown(result)
428
 
429
+ # Parse and process papers
430
+ papers = parse_arxiv_refs(refs)
431
+ if papers:
432
+ create_paper_audio_files(papers)
433
+ display_papers(papers)
434
+ else:
435
+ st.warning("No papers found in the response.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
 
 
 
 
437
  elapsed = time.time()-start
438
  st.write(f"**Total Elapsed:** {elapsed:.2f} s")
439
+
440
+ # Save full transcript
441
  create_file(q, result, "md")
442
  return result
443