eljanmahammadli committed
Commit 5650543 · 1 Parent(s): fa3e7dd

added pagination to google search, now retrieving more sites

Files changed (2):
  1. .gitignore +2 -1
  2. google_search.py +47 -12
.gitignore CHANGED
@@ -8,4 +8,5 @@ nohup.out
 temp.py
 temp.ipynb
 chroma_db/
-temp.txt
+temp.txt
+temp*
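(Since the new `temp*` glob already matches `temp.txt`, `temp.py`, and `temp.ipynb`, the explicit `temp.txt` entry is now redundant.)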
google_search.py CHANGED
@@ -100,6 +100,7 @@ months = {
 }
 
 domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
+skip_urls = ["youtube.com", "twitter.com", "facebook.com", "instagram.com", "x.com"]
 
 
 def build_date(year=2024, month="March", day=1):
@@ -175,22 +176,47 @@ def google_search_urls(
     domains_to_include,
     api_key,
     cse_id,
+    num_results=10,  # Number of results to fetch per page
+    total_results=30,  # Total number of results to fetch
+    skip_urls=None,  # List of URLs to skip
     **kwargs,
 ):
+    if skip_urls is None:
+        skip_urls = []  # Initialize as empty list if not provided
+
     service = build("customsearch", "v1", developerKey=api_key)
-    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
     url_list = []
-    if "items" in results and len(results["items"]) > 0:
-        for count, link in enumerate(results["items"]):
-            # skip user selected domains
-            if (domains_to_include is None) or not any(
-                ("." + domain) in link["link"] for domain in domains_to_include
-            ):
-                continue
-            url = link["link"]
-            if url not in url_list:
-                url_list.append(url)
-    return url_list
+    start_index = 1  # Initial index for the search results
+    while len(url_list) < total_results:
+        # Fetch a page of results
+        results = service.cse().list(
+            q=text,
+            cx=cse_id,
+            sort=sorted_date,
+            start=start_index,
+            num=min(num_results, total_results - len(url_list)),
+            **kwargs,
+        ).execute()
+
+        if "items" in results and len(results["items"]) > 0:
+            for count, link in enumerate(results["items"]):
+                url = link["link"]
+                # Skip if the URL is in the skip_urls list or doesn't match the domain filter
+                if url in skip_urls:
+                    continue
+                if (domains_to_include is None) or any(
+                    ("." + domain) in url for domain in domains_to_include
+                ):
+                    if url not in url_list:
+                        url_list.append(url)
+        else:
+            # No more results
+            break
+
+        # Move to the next page of results
+        start_index += num_results
+
+    return url_list[:total_results]
 
 
 def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
@@ -206,6 +232,15 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
         api_key,
         cse_id,
     )
+    print("---")
+    print(len(url_list))
+    print(url_list)
+    print("---")
     print("Google Search processing time: ", time.perf_counter() - start_time)
     result_content = build_results_beautifulsoup(url_list)
     return result_content
+
+
+res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
+print(res.keys())
+print(len(res))
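For reference, pagination in the Custom Search JSON API is driven by the 1-based `start` offset together with `num`, which the API caps at 10 results per request (and roughly 100 results per query in total, so very large `total_results` values cannot be honored). The `sorted_date` value in the test call above ("date:r:20240101:20241231") appears to use the API's date-range restriction syntax for the `sort` parameter. Below is a minimal standalone sketch of the same pagination pattern; `API_KEY` and `CSE_ID` are hypothetical placeholders, not values from this commit:

```python
# Minimal pagination sketch for the Custom Search JSON API.
# Assumes google-api-python-client is installed; API_KEY and CSE_ID
# are hypothetical placeholders, not values from this commit.
from googleapiclient.discovery import build

API_KEY = "your-api-key"  # hypothetical
CSE_ID = "your-cse-id"    # hypothetical


def fetch_result_urls(query, total=30, per_page=10):
    """Collect up to `total` result URLs, paging via the 1-based `start` offset."""
    service = build("customsearch", "v1", developerKey=API_KEY)
    urls, start = [], 1
    while len(urls) < total:
        resp = (
            service.cse()
            .list(q=query, cx=CSE_ID, start=start,
                  num=min(per_page, total - len(urls)))
            .execute()
        )
        items = resp.get("items", [])
        if not items:
            break  # no further pages available
        urls.extend(item["link"] for item in items)
        start += per_page  # next request starts where this page ended
    return urls[:total]


if __name__ == "__main__":
    print(fetch_result_urls("low resource languages"))
```

The commit's `google_search_urls` follows the same loop, additionally filtering each page through `skip_urls` and `domains_to_include` and deduplicating before appending to the result list.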