Spaces:
Sleeping
Sleeping
remove unused code, add exceptions if variables not set
Browse files
- app.py +34 -14
- cfg.py +28 -30
- rtd_scraper/scrape_rtd.py +0 -53
app.py
CHANGED
@@ -6,34 +6,54 @@ import pandas as pd
|
|
6 |
from buster.completers import Completion
|
7 |
|
8 |
# from embed_docs import embed_rtd_website
|
9 |
-
from rtd_scraper.scrape_rtd import scrape_rtd
|
|
|
10 |
import cfg
|
11 |
from cfg import setup_buster
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
-
|
15 |
-
if os.getenv("OPENAI_API_KEY") is None:
|
16 |
print(
|
17 |
-
"Warning: No
|
18 |
)
|
19 |
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
# scrape and embed content from readthedocs website
|
25 |
-
# comment out if already embedded locally to avoid extra costs
|
26 |
-
scrape_rtd(
|
27 |
-
homepage_url=homepage_url, save_directory="outputs/", target_version=target_version
|
28 |
-
)
|
29 |
|
|
|
|
|
30 |
|
31 |
-
#
|
32 |
-
|
|
|
|
|
|
|
|
|
33 |
|
|
|
34 |
buster = setup_buster(cfg.buster_cfg)
|
35 |
|
36 |
|
|
|
37 |
def add_user_question(
|
38 |
user_question: str, chat_history: Optional[ChatHistory] = None
|
39 |
) -> ChatHistory:
|
@@ -157,5 +177,5 @@ with demo:
|
|
157 |
)
|
158 |
|
159 |
|
160 |
-
demo.queue(concurrency_count=
|
161 |
demo.launch(share=False)
|
|
|
6 |
from buster.completers import Completion
|
7 |
|
8 |
# from embed_docs import embed_rtd_website
|
9 |
+
# from rtd_scraper.scrape_rtd import scrape_rtd
|
10 |
+
from embed_docs import embed_documents
|
11 |
import cfg
|
12 |
from cfg import setup_buster
|
13 |
|
14 |
+
# Typehint for chatbot history
|
15 |
+
ChatHistory = list[list[Optional[str], Optional[str]]]
|
16 |
+
|
17 |
+
|
18 |
+
# Because this is a one-click deploy app, we rely on environment variables being set
|
19 |
+
openai_api_key = os.getenv("OPENAI_API_KEY") # Mandatory for app to work
|
20 |
+
readthedocs_url = os.getenv("READTHEDOCS_URL") # Mandatory for app to work as intended
|
21 |
+
readthedocs_version = os.getenv("READTHEDOCS_VERSION")
|
22 |
|
23 |
+
if openai_api_key is None:
|
|
|
24 |
print(
|
25 |
+
"Warning: No OPENAI_API_KEY detected. Set it with 'export OPENAI_API_KEY=sk-...'."
|
26 |
)
|
27 |
|
28 |
+
if readthedocs_url is None:
|
29 |
+
raise ValueError(
|
30 |
+
"No READTHEDOCS_URL detected. Set it with e.g. 'export READTHEDOCS_URL=https://orion.readthedocs.io/'"
|
31 |
+
)
|
32 |
|
33 |
+
if readthedocs_version is None:
|
34 |
+
print(
|
35 |
+
"""
|
36 |
+
Warning: No READTHEDOCS_VERSION detected. If multiple versions of the docs exist, they will all be scraped.
|
37 |
+
Set it with e.g. 'export READTHEDOCS_VERSION=en/stable'
|
38 |
+
"""
|
39 |
+
)
|
40 |
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
+
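# Directory passed to embed_documents as save_directory; change it to store the outputs elsewhere (NOTE: cfg.py hard-codes "outputs/deeplake_store" — presumably must be updated to match; verify)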
|
43 |
+
save_directory = "outputs/"
|
44 |
|
45 |
+
# scrape and embed content from readthedocs website
|
46 |
+
embed_documents(
|
47 |
+
homepage_url=readthedocs_url,
|
48 |
+
save_directory=save_directory,
|
49 |
+
target_version=readthedocs_version,
|
50 |
+
)
|
51 |
|
52 |
+
# Setup RAG agent
|
53 |
buster = setup_buster(cfg.buster_cfg)
|
54 |
|
55 |
|
56 |
+
# Setup Gradio app
|
57 |
def add_user_question(
|
58 |
user_question: str, chat_history: Optional[ChatHistory] = None
|
59 |
) -> ChatHistory:
|
|
|
177 |
)
|
178 |
|
179 |
|
180 |
+
demo.queue(concurrency_count=8)
|
181 |
demo.launch(share=False)
|
cfg.py
CHANGED
@@ -6,37 +6,7 @@ from buster.retriever import DeepLakeRetriever, Retriever
|
|
6 |
from buster.tokenizers import GPTTokenizer
|
7 |
from buster.validators import QuestionAnswerValidator, Validator
|
8 |
|
9 |
-
from rtd_scraper.scrape_rtd import scrape_rtd
|
10 |
-
|
11 |
buster_cfg = BusterConfig(
|
12 |
-
validator_cfg={
|
13 |
-
"unknown_response_templates": [
|
14 |
-
"I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
|
15 |
-
],
|
16 |
-
"unknown_threshold": 0.85,
|
17 |
-
"embedding_model": "text-embedding-ada-002",
|
18 |
-
"use_reranking": True,
|
19 |
-
"invalid_question_response": "This question does not seem relevant to my current knowledge.",
|
20 |
-
"check_question_prompt": """You are an chatbot answering questions on python libraries.
|
21 |
-
|
22 |
-
Your job is to determine wether or not a question is valid, and should be answered.
|
23 |
-
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
|
24 |
-
|
25 |
-
For example:
|
26 |
-
|
27 |
-
Q: How can I install the library?
|
28 |
-
true
|
29 |
-
|
30 |
-
Q: What is the meaning of life?
|
31 |
-
false
|
32 |
-
|
33 |
-
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
|
34 |
-
"completion_kwargs": {
|
35 |
-
"model": "gpt-3.5-turbo",
|
36 |
-
"stream": False,
|
37 |
-
"temperature": 0,
|
38 |
-
},
|
39 |
-
},
|
40 |
retriever_cfg={
|
41 |
"path": "outputs/deeplake_store",
|
42 |
"top_k": 3,
|
@@ -87,6 +57,34 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
|
|
87 |
"Now answer the following question:\n"
|
88 |
),
|
89 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
)
|
91 |
|
92 |
|
|
|
6 |
from buster.tokenizers import GPTTokenizer
|
7 |
from buster.validators import QuestionAnswerValidator, Validator
|
8 |
|
|
|
|
|
9 |
buster_cfg = BusterConfig(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
retriever_cfg={
|
11 |
"path": "outputs/deeplake_store",
|
12 |
"top_k": 3,
|
|
|
57 |
"Now answer the following question:\n"
|
58 |
),
|
59 |
},
|
60 |
+
validator_cfg={
|
61 |
+
"unknown_response_templates": [
|
62 |
+
"I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
|
63 |
+
],
|
64 |
+
"unknown_threshold": 0.85,
|
65 |
+
"embedding_model": "text-embedding-ada-002",
|
66 |
+
"use_reranking": True,
|
67 |
+
"invalid_question_response": "This question does not seem relevant to my current knowledge.",
|
68 |
+
"check_question_prompt": """You are an chatbot answering questions on python libraries.
|
69 |
+
|
70 |
+
Your job is to determine wether or not a question is valid, and should be answered.
|
71 |
+
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
|
72 |
+
|
73 |
+
For example:
|
74 |
+
|
75 |
+
Q: How can I install the library?
|
76 |
+
true
|
77 |
+
|
78 |
+
Q: What is the meaning of life?
|
79 |
+
false
|
80 |
+
|
81 |
+
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
|
82 |
+
"completion_kwargs": {
|
83 |
+
"model": "gpt-3.5-turbo",
|
84 |
+
"stream": False,
|
85 |
+
"temperature": 0,
|
86 |
+
},
|
87 |
+
},
|
88 |
)
|
89 |
|
90 |
|
rtd_scraper/scrape_rtd.py
CHANGED
@@ -1,16 +1,11 @@
|
|
1 |
import logging
|
2 |
import os
|
3 |
|
4 |
-
from buster.docparser import get_all_documents
|
5 |
-
from buster.documents_manager import DeepLakeDocumentsManager
|
6 |
-
from buster.parser import SphinxParser
|
7 |
from scrapy.crawler import CrawlerProcess
|
8 |
from scrapy.utils.project import get_project_settings
|
9 |
|
10 |
from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url
|
11 |
|
12 |
-
# from tutorial.spiders.docs_spider import DocsSpider
|
13 |
-
|
14 |
# When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
|
15 |
for name in logging.root.manager.loggerDict:
|
16 |
logger = logging.getLogger(name)
|
@@ -31,51 +26,3 @@ def run_spider(homepage_url, save_directory, target_version=None):
|
|
31 |
|
32 |
# To stop the crawling process gracefully
|
33 |
process.stop()
|
34 |
-
|
35 |
-
|
36 |
-
def scrape_rtd(homepage_url, save_directory, target_version=None):
|
37 |
-
|
38 |
-
# adds https:// and trailing slash
|
39 |
-
homepage_url = sanitize_url(homepage_url)
|
40 |
-
|
41 |
-
# Crawl the website using scrapy
|
42 |
-
run_spider(
|
43 |
-
homepage_url, save_directory=save_directory, target_version=target_version
|
44 |
-
)
|
45 |
-
|
46 |
-
# # Convert the .html pages into chunks using Buster's SphinxParser
|
47 |
-
root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
|
48 |
-
|
49 |
-
# root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
|
50 |
-
df = get_all_documents(
|
51 |
-
root_dir=root_dir,
|
52 |
-
base_url=homepage_url,
|
53 |
-
parser_cls=SphinxParser,
|
54 |
-
min_section_length=100,
|
55 |
-
max_section_length=1000,
|
56 |
-
)
|
57 |
-
|
58 |
-
# Add the source column
|
59 |
-
df["source"] = "readthedocs"
|
60 |
-
|
61 |
-
# Initialize the DeepLake vector store
|
62 |
-
dm = DeepLakeDocumentsManager(
|
63 |
-
vector_store_path=os.path.join(save_directory, "deeplake_store"),
|
64 |
-
overwrite=True,
|
65 |
-
required_columns=["url", "content", "source", "title"],
|
66 |
-
)
|
67 |
-
|
68 |
-
# Add all embeddings to the vector store
|
69 |
-
dm.batch_add(
|
70 |
-
df=df,
|
71 |
-
batch_size=3000,
|
72 |
-
min_time_interval=60,
|
73 |
-
num_workers=32,
|
74 |
-
)
|
75 |
-
|
76 |
-
|
77 |
-
if __name__ == "__main__":
|
78 |
-
homepage_url = "https://orion.readthedocs.io/"
|
79 |
-
scrape_rtd(
|
80 |
-
homepage_url=homepage_url, target_version="v0.2.7", save_directory="outputs/"
|
81 |
-
)
|
|
|
1 |
import logging
|
2 |
import os
|
3 |
|
|
|
|
|
|
|
4 |
from scrapy.crawler import CrawlerProcess
|
5 |
from scrapy.utils.project import get_project_settings
|
6 |
|
7 |
from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url
|
8 |
|
|
|
|
|
9 |
# When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
|
10 |
for name in logging.root.manager.loggerDict:
|
11 |
logger = logging.getLogger(name)
|
|
|
26 |
|
27 |
# To stop the crawling process gracefully
|
28 |
process.stop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|