DeepResearchEvaluator

Running on CPU Upgrade

App Files Files Community

awacke1 committed 9 days ago

Commit

e90888f

verified ·

1 Parent(s): ad5d2a3

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -81

app.py CHANGED Viewed

@@ -301,106 +301,143 @@ def save_full_transcript(query, text):
 def parse_arxiv_refs(ref_text: str):
     """
-    Parse paper references with format:
-    **DATE | TITLE | ⬇️**
-    AUTHORS
-    SUMMARY
-    Returns list of dicts with paper details, limited to 20 papers.
-    Returns empty list if parsing fails.
     """
-    try:
-        if not ref_text:
-            return []
-        # Split on the paper header pattern
-        papers = re.split(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
-        headers = re.findall(r'\*\*.*?\|\s*.*?\|\s*.*?\*\*', ref_text)
-        results = []
-        for i, (header, content) in enumerate(zip(headers, papers[1:])):
-            if i >= 20:  # Limit to 20 papers
-                break
             try:
-                # Parse header parts
-                header_parts = [p.strip() for p in header.strip('*').split('|')]
-                if len(header_parts) >= 2:
-                    date_str = header_parts[0].strip()
-                    title = header_parts[1].strip()
-                    # Parse content into authors and summary
-                    content_parts = content.strip().split('\n', 1)
-                    authors = content_parts[0].strip('*') if content_parts else ""
-                    summary = content_parts[1].strip() if len(content_parts) > 1 else ""
-                    # Extract year from date
-                    year_match = re.search(r'20\d{2}', date_str)
-                    year = int(year_match.group(0)) if year_match else None
-                    results.append({
-                        'title': title,
-                        'summary': summary,
-                        'authors': authors,
-                        'year': year,
-                        'date': date_str
-                    })
             except Exception as e:
-                st.warning(f"Error parsing paper {i+1}: {str(e)}")
                 continue
-        return results
-    except Exception as e:
-        st.error(f"Error parsing papers: {str(e)}")
-        return []
 def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
-                      titles_summary=True, full_audio=False):
-    """Perform Arxiv search and generate audio summaries."""
     start = time.time()
     # Query the HF RAG pipeline
     client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
-    refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
-    r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
     # Combine for final text output
     result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
     st.markdown(result)
-    # Parse references
-    parsed_refs = parse_arxiv_refs(refs)
-    # Sort only if we have results
-    if parsed_refs:
-        parsed_refs.sort(key=lambda x: x.get("year", 0) if x.get("year") else 0, reverse=True)
-    # Display papers
-    st.write("## Research Papers")
-    for idx, paper in enumerate(parsed_refs):
-        st.markdown(f"**{paper['date']} | {paper['title']} | ⬇️**")
-        st.markdown(f"*{paper['authors']}*")
-        st.markdown(paper['summary'])
-        # Audio controls
-        colA, colB = st.columns(2)
-        with colA:
-            if st.button(f"🔊 Title", key=f"title_{idx}"):
-                text_tts = clean_for_speech(paper['title'])
-                audio_file_title = speak_with_edge_tts(text_tts)
-                play_and_download_audio(audio_file_title)
-        with colB:
-            if st.button(f"🔊 Full Details", key=f"summary_{idx}"):
-                text_tts = clean_for_speech(f"{paper['title']} by {paper['authors']}. {paper['summary']}")
-                audio_file_title_summary = speak_with_edge_tts(text_tts)
-                play_and_download_audio(audio_file_title_summary)
-        st.write("---")
-    # Rest of your existing function...
     elapsed = time.time()-start
     st.write(f"**Total Elapsed:** {elapsed:.2f} s")
     create_file(q, result, "md")
     return result

 def parse_arxiv_refs(ref_text: str):
     """
+    Parse papers by finding lines with two pipe characters as title lines.
+    Returns list of paper dictionaries with audio files.
     """
+    if not ref_text:
+        return []
+    results = []
+    current_paper = {}
+    lines = ref_text.split('\n')
+    for i, line in enumerate(lines):
+        # Check if this is a title line (contains exactly 2 pipe characters)
+        if line.count('|') == 2:
+            # If we have a previous paper, add it to results
+            if current_paper:
+                results.append(current_paper)
+                if len(results) >= 20:  # Limit to 20 papers
+                    break
+            # Parse new paper header
             try:
+                # Remove ** and split by |
+                header_parts = line.strip('* ').split('|')
+                date = header_parts[0].strip()
+                title = header_parts[1].strip()
+                # Extract arXiv URL if present
+                url_match = re.search(r'(https://arxiv.org/\S+)', line)
+                url = url_match.group(1) if url_match else f"paper_{len(results)}"
+                current_paper = {
+                    'date': date,
+                    'title': title,
+                    'url': url,
+                    'authors': '',
+                    'summary': '',
+                    'content_start': i + 1  # Track where content begins
+                }
             except Exception as e:
+                st.warning(f"Error parsing paper header: {str(e)}")
+                current_paper = {}
                 continue
+        # If we have a current paper and this isn't a title line, add to content
+        elif current_paper:
+            if not current_paper['authors']:  # First line after title is authors
+                current_paper['authors'] = line.strip('* ')
+            else:  # Rest is summary
+                if current_paper['summary']:
+                    current_paper['summary'] += ' ' + line.strip()
+                else:
+                    current_paper['summary'] = line.strip()
+    # Don't forget the last paper
+    if current_paper:
+        results.append(current_paper)
+    return results[:20]  # Ensure we return maximum 20 papers
+def create_paper_audio_files(papers):
+    """
+    Create audio files for each paper's components and add file paths to paper dict.
+    """
+    for paper in papers:
+        try:
+            # Generate audio for title
+            title_text = clean_for_speech(paper['title'])
+            title_file = speak_with_edge_tts(title_text)
+            paper['title_audio'] = title_file
+            # Generate audio for full content
+            full_text = f"{paper['title']} by {paper['authors']}. {paper['summary']}"
+            full_text = clean_for_speech(full_text)
+            full_file = speak_with_edge_tts(full_text)
+            paper['full_audio'] = full_file
+        except Exception as e:
+            st.warning(f"Error generating audio for paper {paper['title']}: {str(e)}")
+            paper['title_audio'] = None
+            paper['full_audio'] = None
+def display_papers(papers):
+    """
+    Display papers with their audio controls using URLs as unique keys.
+    """
+    st.write("## Research Papers")
+    for idx, paper in enumerate(papers):
+        with st.expander(f"📄 {paper['title']}", expanded=True):
+            st.markdown(f"**{paper['date']} | {paper['title']} | ⬇️**")
+            st.markdown(f"*{paper['authors']}*")
+            st.markdown(paper['summary'])
+            # Audio controls in columns
+            col1, col2 = st.columns(2)
+            # Use URL as unique key for audio interface
+            key_base = paper['url'].split('/')[-1] if paper['url'].startswith('http') else paper['url']
+            with col1:
+                if paper.get('title_audio'):
+                    st.write("🎙️ Title Audio")
+                    st.audio(paper['title_audio'], key=f"title_{key_base}")
+            with col2:
+                if paper.get('full_audio'):
+                    st.write("📚 Full Paper Audio")
+                    st.audio(paper['full_audio'], key=f"full_{key_base}")
 def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
+                     titles_summary=True, full_audio=False):
+    """Perform Arxiv search with audio generation per paper."""
     start = time.time()
     # Query the HF RAG pipeline
     client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
+    refs = client.predict(q, 20, "Semantic Search",
+                         "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                         api_name="/update_with_rag_md")[0]
+    r2 = client.predict(q, "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                       True, api_name="/ask_llm")
     # Combine for final text output
     result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
     st.markdown(result)
+    # Parse and process papers
+    papers = parse_arxiv_refs(refs)
+    if papers:
+        create_paper_audio_files(papers)
+        display_papers(papers)
+    else:
+        st.warning("No papers found in the response.")
     elapsed = time.time()-start
     st.write(f"**Total Elapsed:** {elapsed:.2f} s")
+    # Save full transcript
     create_file(q, result, "md")
     return result