Changed selenium retrieval implementation
- search_agent.py +22 -2
- search_agent_ui.py +2 -1
- web_crawler.py +17 -22
search_agent.py
CHANGED

@@ -41,6 +41,26 @@ import web_crawler as wc
 console = Console()
 dotenv.load_dotenv()
 
+def get_selenium_driver():
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+    from selenium.common.exceptions import TimeoutException
+
+    chrome_options = Options()
+    chrome_options.add_argument("headless")
+    chrome_options.add_argument("--disable-extensions")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--remote-debugging-port=9222")
+    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
+    chrome_options.add_argument("--window-size=1920,1080")
+
+    driver = webdriver.Chrome(options=chrome_options)
+    return driver
+
+
+
 callbacks = []
 if os.getenv("LANGCHAIN_API_KEY"):
     callbacks.append(

@@ -59,7 +79,7 @@ if __name__ == '__main__':
     query = arguments["SEARCH_QUERY"]
 
     chat = wr.get_chat_llm(provider, model, temperature)
-    console.log(f"Using {model} on {provider} with temperature {temperature}")
+    #console.log(f"Using {model} on {provider} with temperature {temperature}")
 
     with console.status(f"[bold green]Optimizing query for search: {query}"):
         optimize_search_query = wr.optimize_search_query(chat, query, callbacks=callbacks)

@@ -74,7 +94,7 @@ if __name__ == '__main__':
     with console.status(
         f"[bold green]Fetching content for {len(sources)} sources", spinner="growVertical"
     ):
-        contents = wc.get_links_contents(sources)
+        contents = wc.get_links_contents(sources, get_selenium_driver)
     console.log(f"Managed to extract content from {len(contents)} sources")
 
     with console.status(f"[bold green]Embeddubg {len(contents)} sources for content", spinner="growVertical"):
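For context, a minimal sketch of the driver-factory pattern this commit introduces: search_agent.py now builds a headless Chrome driver on demand and hands the factory to the crawler, which calls it only when a page needs a real browser. The fetch_one helper and the example URL below are illustrative assumptions, not code from the repository.

# Illustrative sketch of the factory-injection pattern used above.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_selenium_driver():
    # Same idea as the function added in search_agent.py: a headless Chrome driver.
    options = Options()
    options.add_argument("headless")
    options.add_argument("--no-sandbox")
    return webdriver.Chrome(options=options)

def fetch_one(url, get_driver_func):
    # Hypothetical helper: build a driver, fetch one page, and always quit it
    # (the diff itself does not show an explicit quit()).
    driver = get_driver_func()
    try:
        driver.set_page_load_timeout(8)
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()

if __name__ == "__main__":
    print(len(fetch_one("https://example.com", get_selenium_driver)))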
search_agent_ui.py
CHANGED

@@ -15,6 +15,7 @@ ls_tracer = LangChainTracer(
     client=Client()
 )
 
+
 chat = wr.get_chat_llm(provider="cohere")
 
 st.title("π Simple Search Agent π¬")

@@ -43,7 +44,7 @@ if prompt := st.chat_input():
 
 
     with st.spinner(f"Searching the web for: {optimize_search_query}"):
-        sources = wc.get_sources(optimize_search_query)
+        sources = wc.get_sources(optimize_search_query, max_pages=20)
 
     with st.spinner(f"I'm now retrieveing the {len(sources)} webpages and documents I found (be patient)"):
        contents = wc.get_links_contents(sources)
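The UI change only raises the search budget: get_sources already accepts max_pages (its default of 10 is visible in the web_crawler.py hunk header below), and the Streamlit app now asks for up to 20 results. A toy sketch, with a stubbed body standing in for the real search backend:

# Sketch only: the signature matches the hunk header in web_crawler.py,
# the body is a stand-in for the real search call.
def get_sources(query, max_pages=10, domain=None):
    return [{"link": f"https://example.com/{i}", "snippet": query} for i in range(max_pages)]

sources = get_sources("semantic chunking")                 # old call: up to 10 results
sources = get_sources("semantic chunking", max_pages=20)   # new call in search_agent_ui.py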
web_crawler.py
CHANGED

@@ -52,19 +52,9 @@ def get_sources(query, max_pages=10, domain=None):
         print('Error fetching search results:', error)
         raise
 
-def fetch_with_selenium(url, timeout=8):
-    chrome_options = Options()
-    chrome_options.add_argument("headless")
-    chrome_options.add_argument("--disable-extensions")
-    chrome_options.add_argument("--disable-gpu")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument("--remote-debugging-port=9222")
-    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
-    chrome_options.add_argument("--window-size=1920,1080")
-
-    driver = webdriver.Chrome(options=chrome_options)
-
+
+
+def fetch_with_selenium(url, driver, timeout=8,):
     try:
         driver.set_page_load_timeout(timeout)
         driver.get(url)

@@ -118,28 +108,33 @@ def process_source(source):
         return {**source, 'page_content': source['snippet']}
     return {**source, 'page_content': None}
 
-def get_links_contents(sources):
+def get_links_contents(sources, get_driver_func=None):
     with ThreadPoolExecutor() as executor:
-        results = list(executor.map(process_source, sources))
+        results = list(executor.map(process_source, sources))
+
+    if get_driver_func is None:
+        return [result for result in results if result is not None]
+
     for result in results:
         if result['page_content'] is None:
             url = result['link']
             print(f"Fetching with selenium {url}")
-            html = fetch_with_selenium(url)
+            driver = get_driver_func()
+            html = fetch_with_selenium(url, driver)
             main_content = extract(html, output_format='txt', include_links=True)
             if main_content:
                 result['page_content'] = main_content
-
-    # Filter out None results
-    return [result for result in results if result is not None]
+    return results
 
 def vectorize(contents):
     documents = []
     for content in contents:
         try:
-            metadata = {'title': content['title'], 'source': content['link']}
-            doc = Document(page_content=content['page_content'], metadata=metadata)
-            documents.append(doc)
+            page_content = content['page_content']
+            if page_content: # Sometimes Selenium is not fetching properly
+                metadata = {'title': content['title'], 'source': content['link']}
+                doc = Document(page_content=content['page_content'], metadata=metadata)
+                documents.append(doc)
         except Exception as e:
             print(f"[gray]Error processing content for {content['link']}: {e}")
     semantic_chunker = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-large"), breakpoint_threshold_type="percentile")
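To summarize the new control flow in get_links_contents, here is a condensed, self-contained sketch; process_source and fetch_with_selenium are stubbed stand-ins for the real network and trafilatura extraction calls, and the demo source list is invented.

# Minimal sketch of the new get_links_contents flow:
# 1) map process_source over the sources in a thread pool,
# 2) without a driver factory, keep the old behaviour and drop None results,
# 3) with a driver factory, retry pages that came back empty through Selenium.
from concurrent.futures import ThreadPoolExecutor

def process_source(source):
    # Stand-in: the real function downloads the URL and extracts its text.
    return {**source, "page_content": source.get("snippet")}

def fetch_with_selenium(url, driver, timeout=8):
    # Stand-in for the real Selenium fetch; returns raw HTML.
    return f"<html><body>{url}</body></html>"

def get_links_contents(sources, get_driver_func=None):
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_source, sources))

    if get_driver_func is None:
        # Old behaviour: drop sources that could not be fetched at all.
        return [result for result in results if result is not None]

    for result in results:
        if result["page_content"] is None:
            driver = get_driver_func()
            html = fetch_with_selenium(result["link"], driver)
            if html:
                result["page_content"] = html
    return results

if __name__ == "__main__":
    demo = [{"link": "https://example.com", "snippet": None}]
    print(get_links_contents(demo, get_driver_func=lambda: None))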