Spaces:
Runtime error
Runtime error
eljanmahammadli
committed on
Commit
·
5650543
1
Parent(s):
fa3e7dd
added pagination to google search; now retrieving more sites
Browse files
- .gitignore +2 -1
- google_search.py +47 -12
.gitignore
CHANGED
@@ -8,4 +8,5 @@ nohup.out
|
|
8 |
temp.py
|
9 |
temp.ipynb
|
10 |
chroma_db/
|
11 |
-
temp.txt
|
|
|
|
8 |
temp.py
|
9 |
temp.ipynb
|
10 |
chroma_db/
|
11 |
+
temp.txt
|
12 |
+
temp*
|
google_search.py
CHANGED
@@ -100,6 +100,7 @@ months = {
|
|
100 |
}
|
101 |
|
102 |
domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
|
|
|
103 |
|
104 |
|
105 |
def build_date(year=2024, month="March", day=1):
|
@@ -175,22 +176,47 @@ def google_search_urls(
|
|
175 |
domains_to_include,
|
176 |
api_key,
|
177 |
cse_id,
|
|
|
|
|
|
|
178 |
**kwargs,
|
179 |
):
|
|
|
|
|
|
|
180 |
service = build("customsearch", "v1", developerKey=api_key)
|
181 |
-
results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
|
182 |
url_list = []
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
|
196 |
def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
|
@@ -206,6 +232,15 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
|
|
206 |
api_key,
|
207 |
cse_id,
|
208 |
)
|
|
|
|
|
|
|
|
|
209 |
print("Google Search processing time: ", time.perf_counter() - start_time)
|
210 |
result_content = build_results_beautifulsoup(url_list)
|
211 |
return result_content
|
|
|
|
|
|
|
|
|
|
|
|
100 |
}
|
101 |
|
102 |
domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
|
103 |
+
skip_urls = ["youtube.com", "twitter.com", "facebook.com", "instagram.com", "x.com"]
|
104 |
|
105 |
|
106 |
def build_date(year=2024, month="March", day=1):
|
|
|
176 |
domains_to_include,
|
177 |
api_key,
|
178 |
cse_id,
|
179 |
+
num_results=10, # Number of results to fetch per page
|
180 |
+
total_results=30, # Total number of results to fetch
|
181 |
+
skip_urls=None, # List of URLs to skip
|
182 |
**kwargs,
|
183 |
):
|
184 |
+
if skip_urls is None:
|
185 |
+
skip_urls = [] # Initialize as empty list if not provided
|
186 |
+
|
187 |
service = build("customsearch", "v1", developerKey=api_key)
|
|
|
188 |
url_list = []
|
189 |
+
start_index = 1 # Initial index for the search results
|
190 |
+
while len(url_list) < total_results:
|
191 |
+
# Fetch a page of results
|
192 |
+
results = service.cse().list(
|
193 |
+
q=text,
|
194 |
+
cx=cse_id,
|
195 |
+
sort=sorted_date,
|
196 |
+
start=start_index,
|
197 |
+
num=min(num_results, total_results - len(url_list)),
|
198 |
+
**kwargs
|
199 |
+
).execute()
|
200 |
+
|
201 |
+
if "items" in results and len(results["items"]) > 0:
|
202 |
+
for count, link in enumerate(results["items"]):
|
203 |
+
url = link["link"]
|
204 |
+
# Skip if the URL is in the skip_urls list or doesn't match the domain filter
|
205 |
+
if url in skip_urls:
|
206 |
+
continue
|
207 |
+
if (domains_to_include is None) or any(
|
208 |
+
("." + domain) in url for domain in domains_to_include
|
209 |
+
):
|
210 |
+
if url not in url_list:
|
211 |
+
url_list.append(url)
|
212 |
+
else:
|
213 |
+
# No more results
|
214 |
+
break
|
215 |
+
|
216 |
+
# Move to the next page of results
|
217 |
+
start_index += num_results
|
218 |
+
|
219 |
+
return url_list[:total_results]
|
220 |
|
221 |
|
222 |
def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
|
|
|
232 |
api_key,
|
233 |
cse_id,
|
234 |
)
|
235 |
+
print("---")
|
236 |
+
print(len(url_list))
|
237 |
+
print(url_list)
|
238 |
+
print("---")
|
239 |
print("Google Search processing time: ", time.perf_counter() - start_time)
|
240 |
result_content = build_results_beautifulsoup(url_list)
|
241 |
return result_content
|
242 |
+
|
243 |
+
|
244 |
+
res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
|
245 |
+
print(res.keys())
|
246 |
+
print(len(res))
|