CyranoB committed on
Commit 542890e · 1 Parent(s): d594a38

Changed selenium retrieval implementation

Files changed (3):
  1. search_agent.py +22 -2
  2. search_agent_ui.py +2 -1
  3. web_crawler.py +17 -22
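
In effect, the commit moves Chrome driver construction out of web_crawler.py and into the caller, which now injects a zero-argument driver factory. A minimal sketch of the new calling convention, pieced together from the diffs below (the query string and the pared-down option set are illustrative, not from the commit):

    import web_crawler as wc

    def make_driver():
        # Stands in for get_selenium_driver() from search_agent.py below:
        # any zero-argument callable returning a ready WebDriver works.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options

        opts = Options()
        opts.add_argument("headless")  # the real factory sets several more flags
        return webdriver.Chrome(options=opts)

    sources = wc.get_sources("example query", max_pages=10)
    # With a factory, pages that resist a plain fetch fall back to Selenium:
    contents = wc.get_links_contents(sources, make_driver)
    # Without one, the fallback is skipped entirely:
    contents_plain = wc.get_links_contents(sources)
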
search_agent.py CHANGED
@@ -41,6 +41,26 @@ import web_crawler as wc
 console = Console()
 dotenv.load_dotenv()
 
+def get_selenium_driver():
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+    from selenium.common.exceptions import TimeoutException
+
+    chrome_options = Options()
+    chrome_options.add_argument("headless")
+    chrome_options.add_argument("--disable-extensions")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--remote-debugging-port=9222")
+    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
+    chrome_options.add_argument("--window-size=1920,1080")
+
+    driver = webdriver.Chrome(options=chrome_options)
+    return driver
+
+
+
 callbacks = []
 if os.getenv("LANGCHAIN_API_KEY"):
     callbacks.append(
@@ -59,7 +79,7 @@ if __name__ == '__main__':
     query = arguments["SEARCH_QUERY"]
 
     chat = wr.get_chat_llm(provider, model, temperature)
-    console.log(f"Using {chat.get_name} on {provider} with temperature {temperature}")
+    #console.log(f"Using {model} on {provider} with temperature {temperature}")
 
     with console.status(f"[bold green]Optimizing query for search: {query}"):
         optimize_search_query = wr.optimize_search_query(chat, query, callbacks=callbacks)
@@ -74,7 +94,7 @@ if __name__ == '__main__':
     with console.status(
         f"[bold green]Fetching content for {len(sources)} sources", spinner="growVertical"
     ):
-        contents = wc.get_links_contents(sources)
+        contents = wc.get_links_contents(sources, get_selenium_driver)
     console.log(f"Managed to extract content from {len(contents)} sources")
 
     with console.status(f"[bold green]Embeddubg {len(contents)} sources for content", spinner="growVertical"):
search_agent_ui.py CHANGED
@@ -15,6 +15,7 @@ ls_tracer = LangChainTracer(
     client=Client()
 )
 
+
 chat = wr.get_chat_llm(provider="cohere")
 
 st.title("🔍 Simple Search Agent 💬")
@@ -43,7 +44,7 @@ if prompt := st.chat_input():
 
 
 with st.spinner(f"Searching the web for: {optimize_search_query}"):
-    sources = wc.get_sources(optimize_search_query)
+    sources = wc.get_sources(optimize_search_query, max_pages=20)
 
 with st.spinner(f"I'm now retrieveing the {len(sources)} webpages and documents I found (be patient)"):
     contents = wc.get_links_contents(sources)
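
Note that the Streamlit app still calls wc.get_links_contents(sources) with no driver factory, so under the new signature it takes the fast path and never falls back to Selenium. If the fallback were wanted here too, the call would presumably become the following (get_selenium_driver is assumed importable; in this commit it is defined in search_agent.py):

    contents = wc.get_links_contents(sources, get_selenium_driver)
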
web_crawler.py CHANGED
@@ -52,19 +52,9 @@ def get_sources(query, max_pages=10, domain=None):
         print('Error fetching search results:', error)
         raise
 
-def fetch_with_selenium(url, timeout=8):
-    chrome_options = Options()
-    chrome_options.add_argument("headless")
-    chrome_options.add_argument("--disable-extensions")
-    chrome_options.add_argument("--disable-gpu")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument("--remote-debugging-port=9222")
-    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
-    chrome_options.add_argument("--window-size=1920,1080")
-
-    driver = webdriver.Chrome(options=chrome_options)
-
+
+
+def fetch_with_selenium(url, driver, timeout=8,):
     try:
         driver.set_page_load_timeout(timeout)
         driver.get(url)
@@ -118,28 +108,33 @@ def process_source(source):
         return {**source, 'page_content': source['snippet']}
     return {**source, 'page_content': None}
 
-def get_links_contents(sources):
+def get_links_contents(sources, get_driver_func=None):
     with ThreadPoolExecutor() as executor:
-        results = list(executor.map(process_source, sources))
+        results = list(executor.map(process_source, sources))
+
+    if get_driver_func is None:
+        return [result for result in results if result is not None]
+
     for result in results:
         if result['page_content'] is None:
             url = result['link']
             print(f"Fetching with selenium {url}")
-            html = fetch_with_selenium(url, 8)
+            driver = get_driver_func()
+            html = fetch_with_selenium(url, driver)
             main_content = extract(html, output_format='txt', include_links=True)
             if main_content:
                 result['page_content'] = main_content
-
-    # Filter out None results
-    return [result for result in results if result is not None]
+    return results
 
 def vectorize(contents):
     documents = []
     for content in contents:
         try:
-            metadata = {'title': content['title'], 'source': content['link']}
-            doc = Document(page_content=content['page_content'], metadata=metadata)
-            documents.append(doc)
+            page_content = content['page_content']
+            if page_content: # Sometimes Selenium is not fetching properly
+                metadata = {'title': content['title'], 'source': content['link']}
+                doc = Document(page_content=content['page_content'], metadata=metadata)
+                documents.append(doc)
         except Exception as e:
             print(f"[gray]Error processing content for {content['link']}: {e}")
     semantic_chunker = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-large"), breakpoint_threshold_type="percentile")
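
One behavioral shift worth spelling out: with get_driver_func supplied, get_links_contents now returns every result rather than filtering out Nones, and the new page_content guard in vectorize is what tolerates entries Selenium still could not fill. A small illustration (assumes web_crawler imported as wc and the get_selenium_driver factory from search_agent.py in scope; the source dict uses the keys the code reads: title, link, snippet):

    sources = [{"title": "Example", "link": "https://example.com", "snippet": ""}]

    # No factory: the old filtered behavior, minus the Selenium fallback.
    fast = wc.get_links_contents(sources)

    # With a factory: the fallback runs and results come back unfiltered,
    # so page_content can still be None when Selenium also came up empty.
    # The guard in vectorize() absorbs those instead of building empty Documents.
    full = wc.get_links_contents(sources, get_selenium_driver)
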