eljanmahammadli committed
Commit a6fbfb6 · Parent: 5650543

chore: increase of numbers to scrape; disabled PDF check in scholar model

Files changed:
- app.py +3 -3
- google_search.py +7 -7
app.py CHANGED
@@ -21,9 +21,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from google.cloud import storage
 
 if gr.NO_RELOAD:
-    # from humanize import humanize_text, device
-    humanize_text = None
-    device = None
+    from humanize import humanize_text, device
+    # humanize_text = None
+    # device = None
 from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
 from google_search import google_search, months, domain_list, build_date
 from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
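The app.py change flips a toggle inside the gr.NO_RELOAD block: before this commit, humanize_text and device were stubbed to None (humanization disabled); now they are imported from the Space's own humanize module. In Gradio, code under `if gr.NO_RELOAD:` runs on the initial launch but is skipped when reload mode (`gradio app.py`) re-executes the script, which keeps heavy model loads out of the hot-reload path. A minimal sketch of the pattern, assuming humanize exposes humanize_text and device as the diff suggests:

import gradio as gr

if gr.NO_RELOAD:
    # Not re-executed when `gradio app.py` hot-reloads this file, so the
    # model behind humanize_text is loaded once per launch, not per edit.
    from humanize import humanize_text, device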
google_search.py CHANGED
@@ -14,7 +14,7 @@ API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
 CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 
 # Number of pages to scrape
-NUM_PAGES =
+NUM_PAGES = 20
 
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
@@ -223,8 +223,8 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
     cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
     start_time = time.perf_counter()
-    if scholar_mode_check:
-        topic += " -filetype:pdf"
+    # if scholar_mode_check:
+    #     topic += " -filetype:pdf"
     url_list = google_search_urls(
         topic,
         sorted_date,
@@ -240,7 +240,7 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     result_content = build_results_beautifulsoup(url_list)
     return result_content
 
-
-res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
-print(res.keys())
-print(len(res))
+if __name__ == "__main__":
+    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
+    print(res.keys())
+    print(len(res))
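Neither google_search_urls nor the scraping loop appears in this diff, so the sketch below is only a plausible reconstruction of how NUM_PAGES and the now-disabled `-filetype:pdf` exclusion fit together; the helper name collect_urls and the pagination loop are assumptions. The key, cx, q, num, start, and sort query parameters are the Custom Search JSON API's documented ones, and the sort value matches the sorted_date string used in the new __main__ smoke test:

import os
import requests

API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
NUM_PAGES = 20  # target number of result URLs, as raised by this commit

def collect_urls(topic, sorted_date, scholar_mode_check=False):
    # The branch this commit disables: scholar mode appended Google's
    # filetype operator to drop PDF results from the query.
    # if scholar_mode_check:
    #     topic += " -filetype:pdf"
    urls, start = [], 1
    while len(urls) < NUM_PAGES:
        resp = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": API_KEY,
                "cx": CSE_KEY,
                "q": topic,
                "sort": sorted_date,  # e.g. "date:r:20240101:20241231"
                "num": 10,            # the API returns at most 10 items per request
                "start": start,
            },
            timeout=10,
        )
        items = resp.json().get("items", [])
        if not items:
            break
        urls.extend(item["link"] for item in items)
        start += 10
    return urls[:NUM_PAGES]

if __name__ == "__main__":
    print(len(collect_urls("Low Resource ", "date:r:20240101:20241231")))

Raising NUM_PAGES to 20 simply hands more URLs to build_results_beautifulsoup, so each search now scrapes more pages than before.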