CyranoB committed on
Commit 542890e · 1 Parent(s): d594a38

Changed selenium retrieval implementation

Files changed (3):
  1. search_agent.py +22 -2
  2. search_agent_ui.py +2 -1
  3. web_crawler.py +17 -22
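
In effect, the commit moves Chrome driver construction out of web_crawler.py and into the caller, which now injects a zero-argument driver factory. A minimal sketch of the new calling convention, pieced together from the diffs below (the query string and the pared-down option set are illustrative, not from the commit):

    import web_crawler as wc

    def make_driver():
        # Stands in for get_selenium_driver() from search_agent.py below:
        # any zero-argument callable returning a ready WebDriver works.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options

        opts = Options()
        opts.add_argument("headless")  # the real factory sets several more flags
        return webdriver.Chrome(options=opts)

    sources = wc.get_sources("example query", max_pages=10)
    # With a factory, pages that resist a plain fetch fall back to Selenium:
    contents = wc.get_links_contents(sources, make_driver)
    # Without one, the fallback is skipped entirely:
    contents_plain = wc.get_links_contents(sources)
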
search_agent.py CHANGED
@@ -41,6 +41,26 @@ import web_crawler as wc
 console = Console()
 dotenv.load_dotenv()
 
+def get_selenium_driver():
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+    from selenium.common.exceptions import TimeoutException
+
+    chrome_options = Options()
+    chrome_options.add_argument("headless")
+    chrome_options.add_argument("--disable-extensions")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--remote-debugging-port=9222")
+    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
+    chrome_options.add_argument("--window-size=1920,1080")
+
+    driver = webdriver.Chrome(options=chrome_options)
+    return driver
+
+
+
 callbacks = []
 if os.getenv("LANGCHAIN_API_KEY"):
     callbacks.append(
@@ -59,7 +79,7 @@ if __name__ == '__main__':
     query = arguments["SEARCH_QUERY"]
 
     chat = wr.get_chat_llm(provider, model, temperature)
-    console.log(f"Using {chat.get_name} on {provider} with temperature {temperature}")
+    #console.log(f"Using {model} on {provider} with temperature {temperature}")
 
     with console.status(f"[bold green]Optimizing query for search: {query}"):
         optimize_search_query = wr.optimize_search_query(chat, query, callbacks=callbacks)
@@ -74,7 +94,7 @@ if __name__ == '__main__':
     with console.status(
         f"[bold green]Fetching content for {len(sources)} sources", spinner="growVertical"
     ):
-        contents = wc.get_links_contents(sources)
+        contents = wc.get_links_contents(sources, get_selenium_driver)
     console.log(f"Managed to extract content from {len(contents)} sources")
 
     with console.status(f"[bold green]Embeddubg {len(contents)} sources for content", spinner="growVertical"):
search_agent_ui.py CHANGED
@@ -15,6 +15,7 @@ ls_tracer = LangChainTracer(
     client=Client()
 )
 
+
 chat = wr.get_chat_llm(provider="cohere")
 
 st.title("🔍 Simple Search Agent 💬")
@@ -43,7 +44,7 @@ if prompt := st.chat_input():
 
 
 with st.spinner(f"Searching the web for: {optimize_search_query}"):
-    sources = wc.get_sources(optimize_search_query)
+    sources = wc.get_sources(optimize_search_query, max_pages=20)
 
 with st.spinner(f"I'm now retrieveing the {len(sources)} webpages and documents I found (be patient)"):
     contents = wc.get_links_contents(sources)
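
Note that the Streamlit app still calls wc.get_links_contents(sources) with no driver factory, so under the new signature it takes the fast path and never falls back to Selenium. If the fallback were wanted here too, the call would presumably become the following (get_selenium_driver is assumed importable; in this commit it is defined in search_agent.py):

    contents = wc.get_links_contents(sources, get_selenium_driver)
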
web_crawler.py CHANGED
@@ -52,19 +52,9 @@ def get_sources(query, max_pages=10, domain=None):
         print('Error fetching search results:', error)
         raise
 
-def fetch_with_selenium(url, timeout=8):
-    chrome_options = Options()
-    chrome_options.add_argument("headless")
-    chrome_options.add_argument("--disable-extensions")
-    chrome_options.add_argument("--disable-gpu")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument("--remote-debugging-port=9222")
-    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
-    chrome_options.add_argument("--window-size=1920,1080")
-
-    driver = webdriver.Chrome(options=chrome_options)
-
+
+
+def fetch_with_selenium(url, driver, timeout=8,):
     try:
         driver.set_page_load_timeout(timeout)
         driver.get(url)
@@ -118,28 +108,33 @@ def process_source(source):
         return {**source, 'page_content': source['snippet']}
     return {**source, 'page_content': None}
 
-def get_links_contents(sources):
+def get_links_contents(sources, get_driver_func=None):
     with ThreadPoolExecutor() as executor:
-        results = list(executor.map(process_source, sources))
+        results = list(executor.map(process_source, sources))
+
+    if get_driver_func is None:
+        return [result for result in results if result is not None]
+
     for result in results:
         if result['page_content'] is None:
             url = result['link']
             print(f"Fetching with selenium {url}")
-            html = fetch_with_selenium(url, 8)
+            driver = get_driver_func()
+            html = fetch_with_selenium(url, driver)
             main_content = extract(html, output_format='txt', include_links=True)
             if main_content:
                 result['page_content'] = main_content
-
-    # Filter out None results
-    return [result for result in results if result is not None]
+    return results
 
 def vectorize(contents):
     documents = []
     for content in contents:
         try:
-            metadata = {'title': content['title'], 'source': content['link']}
-            doc = Document(page_content=content['page_content'], metadata=metadata)
-            documents.append(doc)
+            page_content = content['page_content']
+            if page_content: # Sometimes Selenium is not fetching properly
+                metadata = {'title': content['title'], 'source': content['link']}
+                doc = Document(page_content=content['page_content'], metadata=metadata)
+                documents.append(doc)
         except Exception as e:
             print(f"[gray]Error processing content for {content['link']}: {e}")
     semantic_chunker = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-large"), breakpoint_threshold_type="percentile")
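
One behavioral shift worth spelling out: with get_driver_func supplied, get_links_contents now returns every result rather than filtering out Nones, and the new page_content guard in vectorize is what tolerates entries Selenium still could not fill. A small illustration (assumes web_crawler imported as wc and the get_selenium_driver factory from search_agent.py in scope; the source dict uses the keys the code reads: title, link, snippet):

    sources = [{"title": "Example", "link": "https://example.com", "snippet": ""}]

    # No factory: the old filtered behavior, minus the Selenium fallback.
    fast = wc.get_links_contents(sources)

    # With a factory: the fallback runs and results come back unfiltered,
    # so page_content can still be None when Selenium also came up empty.
    # The guard in vectorize() absorbs those instead of building empty Documents.
    full = wc.get_links_contents(sources, get_selenium_driver)
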