awacke1 commited on
Commit
711281e
1 Parent(s): 8e010af

Update backup8.app.py

Browse files
Files changed (1) hide show
  1. backup8.app.py +125 -121
backup8.app.py CHANGED
@@ -299,146 +299,150 @@ def save_full_transcript(query, text):
299
  """Save full transcript of Arxiv results as a file."""
300
  create_file(query, text, "md")
301
 
302
- # ------------------------------
303
- # NEW: Helper to parse references
304
- # ------------------------------
305
def parse_arxiv_refs(ref_text: str):
    """
    Parse multi-line reference text returned by the RAG pipeline.

    Expected line shapes:
        1) [Paper Title 2023] This is the summary ...
        2) [Another Title (2024)] Another summary text ...

    A four-digit 20xx year is looked for in the title first, then in the
    summary; papers without one get ``year=None``.

    Returns a list of dicts: {'title': str, 'summary': str, 'year': int or None}.
    """
    parsed = []
    for raw_line in ref_text.split('\n'):
        entry = raw_line.strip()
        if not entry:
            continue

        # The bracketed segment (if any) carries the title; otherwise the
        # whole line is treated as the summary under a placeholder title.
        bracket = re.search(r"\[([^\]]+)\]", entry)
        if bracket is None:
            title, summary = "No Title", entry
        else:
            title = bracket.group(1).strip()
            summary = entry.replace(bracket.group(0), "").strip()

        # Year: title takes precedence over summary; None when absent.
        year_hit = re.search(r'(20\d{2})', title) or re.search(r'(20\d{2})', summary)
        year = int(year_hit.group(1)) if year_hit else None

        parsed.append({'title': title, 'summary': summary, 'year': year})
    return parsed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
                      titles_summary=True, full_audio=False):
    """Query the Arxiv RAG space, render results, and offer audio renditions.

    q: search query string.
    vocal_summary: speak the short LLM answer.
    extended_refs: speak the full reference list.
    titles_summary: speak all bracketed titles as one clip.
    full_audio: speak answer + references as one long clip.
    Returns the combined markdown result (also saved to a .md file).
    """
    t0 = time.time()

    # Hosted RAG pipeline: reference markdown first, then the LLM answer.
    client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
    refs = client.predict(
        q, 20, "Semantic Search",
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        api_name="/update_with_rag_md",
    )[0]
    r2 = client.predict(
        q, "mistralai/Mixtral-8x7B-Instruct-v0.1", True,
        api_name="/ask_llm",
    )

    # Combined markdown shown to the user and returned to the caller.
    result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
    st.markdown(result)

    # Optional one-shot audio of the whole response.
    if full_audio:
        complete_text = f"Complete response for query: {q}. {clean_for_speech(r2)} {clean_for_speech(refs)}"
        whole_clip = speak_with_edge_tts(complete_text)
        st.write("### 📚 Full Audio")
        play_and_download_audio(whole_clip)

    if vocal_summary:
        short_clip = speak_with_edge_tts(clean_for_speech(r2))
        st.write("### 🎙 Short Audio")
        play_and_download_audio(short_clip)

    if extended_refs:
        spoken_refs = clean_for_speech("Extended references: " + refs.replace('"', ''))
        refs_clip = speak_with_edge_tts(spoken_refs)
        st.write("### 📜 Long Refs")
        play_and_download_audio(refs_clip)

    # Per-paper view, newest first (papers without a year sink to the bottom).
    papers = parse_arxiv_refs(refs)
    # To drop pre-2022 papers, filter here before sorting:
    # papers = [p for p in papers if (p["year"] is not None and p["year"] >= 2022)]
    papers.sort(key=lambda p: p["year"] if p["year"] else 0, reverse=True)

    st.write("## Individual Papers (Most Recent First)")
    for idx, paper in enumerate(papers):
        year_str = paper["year"] if paper["year"] else "Unknown Year"
        st.markdown(f"**{idx+1}. {paper['title']}** \n*Year:* {year_str}")
        st.markdown(f"*Summary:* {paper['summary']}")

        # Per-paper TTS: title alone, or title plus summary.
        colA, colB = st.columns(2)
        with colA:
            if st.button("🔊 Title", key=f"title_{idx}"):
                clip = speak_with_edge_tts(clean_for_speech(paper['title']))
                play_and_download_audio(clip)
        with colB:
            if st.button("🔊 Title+Summary", key=f"summary_{idx}"):
                clip = speak_with_edge_tts(
                    clean_for_speech(paper['title'] + ". " + paper['summary']))
                play_and_download_audio(clip)

        st.write("---")

    # All bracketed titles spoken as a single clip.
    if titles_summary:
        titles = [m.group(1)
                  for m in (re.search(r"\[([^\]]+)\]", line)
                            for line in refs.split('\n'))
                  if m]
        if titles:
            titles_text = clean_for_speech("Titles: " + ", ".join(titles))
            titles_clip = speak_with_edge_tts(titles_text)
            st.write("### 🔖 Titles (All-In-One)")
            play_and_download_audio(titles_clip)

    elapsed = time.time() - t0
    st.write(f"**Total Elapsed:** {elapsed:.2f} s")

    # Always persist the combined result as a markdown file.
    create_file(q, result, "md")

    return result
441
 
 
 
 
442
  def process_with_gpt(text):
443
  """Process text with GPT-4"""
444
  if not text:
@@ -598,9 +602,9 @@ def main():
598
 
599
  # Show input in a text box for editing if detected
600
  if val:
601
- val_stripped = val.replace('\n', ' ')
602
  edited_input = st.text_area("✏️ Edit Input:", value=val_stripped, height=100)
603
- edited_input = edited_input.replace('\n', ' ')
604
 
605
  run_option = st.selectbox("Model:", ["Arxiv", "GPT-4o", "Claude-3.5"])
606
  col1, col2 = st.columns(2)
 
299
  """Save full transcript of Arxiv results as a file."""
300
  create_file(query, text, "md")
301
 
302
+
303
+
 
304
def parse_arxiv_refs(ref_text: str):
    """
    Parse the RAG markdown response into a list of paper dicts.

    A title line is any line containing exactly two pipe characters, e.g.:
        **15 Jan 2024 | Paper Title | ⬇️** https://arxiv.org/abs/...
    The first non-blank line after a header is taken as the authors line;
    remaining non-blank lines are concatenated into the summary.

    Returns at most 20 dicts with keys:
        date, title, url, authors, summary, content_start

    Fixes vs. previous revision:
    - blank lines no longer inject stray spaces into the summary;
    - the arXiv URL regex escapes its dots;
    - a paper can no longer be appended twice when the 20-paper cap is hit
      (previously masked only by the final slice).
    """
    if not ref_text:
        return []

    results = []
    current_paper = {}

    for i, line in enumerate(ref_text.split('\n')):
        # Title line: contains exactly 2 pipe characters.
        if line.count('|') == 2:
            # Flush the previous paper before starting a new one.
            if current_paper:
                results.append(current_paper)
                current_paper = {}          # prevent re-append after the loop
                if len(results) >= 20:      # hard cap at 20 papers
                    break

            try:
                # Strip markdown bold markers, then split "date | title | rest".
                header_parts = line.strip('* ').split('|')
                date = header_parts[0].strip()
                title = header_parts[1].strip()
                # Prefer the real arXiv URL as a stable unique key.
                url_match = re.search(r'(https://arxiv\.org/\S+)', line)
                url = url_match.group(1) if url_match else f"paper_{len(results)}"

                current_paper = {
                    'date': date,
                    'title': title,
                    'url': url,
                    'authors': '',
                    'summary': '',
                    'content_start': i + 1,  # first content line after the header
                }
            except Exception as e:
                st.warning(f"Error parsing paper header: {str(e)}")
                current_paper = {}
            continue

        # Content line for the current paper; skip blanks so the summary
        # doesn't accumulate stray separator spaces.
        if current_paper and line.strip():
            if not current_paper['authors']:
                # First non-blank line after the header is the author list.
                current_paper['authors'] = line.strip('* ')
            elif current_paper['summary']:
                current_paper['summary'] += ' ' + line.strip()
            else:
                current_paper['summary'] = line.strip()

    # Don't forget the last paper.
    if current_paper:
        results.append(current_paper)

    return results[:20]  # ensure we return at most 20 papers
363
 
364
def create_paper_audio_files(papers):
    """
    Generate TTS audio for each paper, mutating the paper dicts in place.

    Adds 'title_audio' and 'full_audio' file paths to every paper dict.
    On any synthesis failure both keys are set to None for that paper and
    a warning is shown; remaining papers are still processed.
    """
    for paper in papers:
        try:
            # Title-only clip.
            paper['title_audio'] = speak_with_edge_tts(clean_for_speech(paper['title']))

            # Title + authors + summary as one narration clip.
            narration = f"{paper['title']} by {paper['authors']}. {paper['summary']}"
            paper['full_audio'] = speak_with_edge_tts(clean_for_speech(narration))

        except Exception as e:
            st.warning(f"Error generating audio for paper {paper['title']}: {str(e)}")
            paper['title_audio'] = None
            paper['full_audio'] = None
385
+
386
def display_papers(papers):
    """
    Render each paper in an expander with its pre-generated audio players.

    Expects the dicts produced by parse_arxiv_refs, optionally augmented by
    create_paper_audio_files with 'title_audio' / 'full_audio' paths.
    """
    st.write("## Research Papers")

    for idx, paper in enumerate(papers):
        with st.expander(f"📄 {paper['title']}", expanded=True):
            st.markdown(f"**{paper['date']} | {paper['title']} | ⬇️**")
            st.markdown(f"*{paper['authors']}*")
            st.markdown(paper['summary'])

            # Audio players side by side; each shown only if its file exists.
            left, right = st.columns(2)
            with left:
                if paper.get('title_audio'):
                    st.write("🎙️ Title Audio")
                    st.audio(paper['title_audio'])
            with right:
                if paper.get('full_audio'):
                    st.write("📚 Full Paper Audio")
                    st.audio(paper['full_audio'])
410
 
411
def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
                      titles_summary=True, full_audio=False):
    """Run the Arxiv RAG lookup and show per-paper audio players.

    NOTE(review): vocal_summary / extended_refs / titles_summary / full_audio
    are kept for caller compatibility but are not consulted in this version.
    Returns the combined markdown result (also written to a .md file).
    """
    t0 = time.time()

    # Two calls to the hosted space: reference markdown, then the LLM answer.
    client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
    refs = client.predict(
        q, 20, "Semantic Search",
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        api_name="/update_with_rag_md",
    )[0]
    r2 = client.predict(
        q, "mistralai/Mixtral-8x7B-Instruct-v0.1", True,
        api_name="/ask_llm",
    )

    # Combined markdown shown to the user and returned to the caller.
    result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
    st.markdown(result)

    # Per-paper breakdown with generated audio.
    papers = parse_arxiv_refs(refs)
    if not papers:
        st.warning("No papers found in the response.")
    else:
        create_paper_audio_files(papers)
        display_papers(papers)

    elapsed = time.time() - t0
    st.write(f"**Total Elapsed:** {elapsed:.2f} s")

    # Save full transcript
    create_file(q, result, "md")
    return result
442
 
443
+
444
+
445
+
446
  def process_with_gpt(text):
447
  """Process text with GPT-4"""
448
  if not text:
 
602
 
603
  # Show input in a text box for editing if detected
604
  if val:
605
+ val_stripped = val.replace('\\n', ' ')
606
  edited_input = st.text_area("✏️ Edit Input:", value=val_stripped, height=100)
607
+ #edited_input = edited_input.replace('\n', ' ')
608
 
609
  run_option = st.selectbox("Model:", ["Arxiv", "GPT-4o", "Claude-3.5"])
610
  col1, col2 = st.columns(2)