fahmiaziz98 committed on
Commit
26de2cd
·
1 Parent(s): 84c30b3
apps/agent/constant.py CHANGED
@@ -23,4 +23,31 @@ PROMPT = ChatPromptTemplate.from_messages(
23
  ),
24
  ("placeholder", "{messages}")
25
  ]
26
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  ),
24
  ("placeholder", "{messages}")
25
  ]
26
+ )
27
+
28
# Documentation pages that apps/agent/tools.py loads and splits to build the
# keyword-search corpus for each product. Every entry is unique and free of
# analytics query strings, so no page is fetched or indexed twice.
URLS_XANO = [
    "https://docs.xano.com/about",
    "https://releases.xano.com/",
    "https://docs.xano.com/onboarding-tutorial-reference",
    "https://docs.xano.com/faq",
    "https://docs.xano.com/what-xano-includes",
    "https://docs.xano.com/what-xano-includes/instance",
    "https://docs.xano.com/what-xano-includes/workspace",
    "https://docs.xano.com/database/triggers",
    "https://docs.xano.com/fundamentals/the-development-life-cycle",
]

URLS_WEWEB = [
    "https://docs.weweb.io/start-here/welcome.html",
    "https://docs.weweb.io/start-here/frequently-asked-questions.html",
    "https://docs.weweb.io/editor/intro-to-the-editor.html",
    "https://docs.weweb.io/editor/intro-to-html-css.html",
    "https://docs.weweb.io/editor/how-to-use-the-add-panel.html",
    "https://docs.weweb.io/editor/logs.html",
    "https://docs.weweb.io/editor/copilot/import-figma-designs.html",
    "https://docs.weweb.io/editor/app-settings/app-settings.html",
    "https://docs.weweb.io/editor/app-settings/pwa.html",
]
apps/agent/multi_query_chain.py DELETED
@@ -1,37 +0,0 @@
1
- from typing import List
2
-
3
- from langchain_core.output_parsers import BaseOutputParser
4
- from langchain_core.prompts import PromptTemplate
5
- from pydantic import BaseModel, Field
6
- from langchain_groq import ChatGroq
7
- from apps.agent.constant import GROQ_API_KEY, MODEL_GROQ
8
-
9
- # Output parser will split the LLM result into a list of queries
10
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser that turns raw LLM text into a list of non-blank lines."""

    def parse(self, text: str) -> List[str]:
        """Split *text* on newlines and return the non-blank lines.

        Each line is stripped of surrounding whitespace (including a trailing
        carriage return from "\\r\\n" output), and lines that are empty after
        stripping are dropped, so no blank entry is returned.
        """
        stripped_lines = (line.strip() for line in text.strip().split("\n"))
        return [line for line in stripped_lines if line]
16
-
17
# Parser that splits the model's newline-separated reply into a list of lines.
output_parser = LineListOutputParser()
# Groq chat model; the model name and API key come from apps.agent.constant.
llm = ChatGroq(model=MODEL_GROQ, groq_api_key=GROQ_API_KEY, temperature=0.1)

# Prompt asking for three alternative phrasings of the user's question, one
# per line, which is the shape the line-based parser above expects.
# NOTE(review): the wording says questions are about LLMs / Machine Learning,
# while the tools in this app search Xano and WeWeb docs -- confirm intent.
template = """
Your task is to generate 3 different search queries that aim to
answer the user question from multiple perspectives. The user questions
are focused on Large Language Models, Machine Learning, and related
disciplines.
Each query MUST tackle the question from a different viewpoint, we
want to get a variety of RELEVANT search results.
Provide these alternative questions separated by newlines.
GENERATE ONLY QUERY! dont add explanation and word
Original question: {question}
"""

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template=template,
)
# Chain: prompt -> chat model -> list of query strings
llm_chain = QUERY_PROMPT | llm | output_parser
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
apps/agent/tools.py CHANGED
@@ -1,39 +1,54 @@
1
  import os
2
- from langchain_community.vectorstores.pinecone import Pinecone
3
  from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
4
  from langchain.retrievers import ContextualCompressionRetriever
5
  from langchain.retrievers.document_compressors import FlashrankRerank
 
6
  from langchain_core.tools import tool
7
- from langchain.retrievers.multi_query import MultiQueryRetriever
8
-
9
- from apps.agent.multi_query_chain import llm_chain
10
- from apps.agent.constant import INDEX_NAME_WEWEB, INDEX_NAME_XANO
11
 
 
 
 
 
 
 
 
12
 
13
  embeddings = FastEmbedEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en")
14
  compressor = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")
15
 
16
def multiquery_retriever(index_name: str, embeddings, compressor) -> ContextualCompressionRetriever:
    """Build a reranked multi-query retriever over an existing Pinecone index.

    The Pinecone index named *index_name* is opened with *embeddings* and
    exposed as a retriever. That retriever is wrapped in a
    ``MultiQueryRetriever`` driven by the module-level ``llm_chain``, and the
    result is wrapped in a ``ContextualCompressionRetriever`` that uses
    *compressor* as its base compressor.
    """
    base = Pinecone.from_existing_index(
        embedding=embeddings, index_name=index_name
    ).as_retriever()

    expanded = MultiQueryRetriever(
        retriever=base, llm_chain=llm_chain, parser_key="lines"
    )

    return ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=expanded
    )
27
 
28
# One reranked multi-query retriever per documentation index, built when this
# module is imported.
retriever_xano = multiquery_retriever(INDEX_NAME_XANO, embeddings, compressor)
retriever_weweb = multiquery_retriever(INDEX_NAME_WEWEB, embeddings, compressor)
 
 
 
 
 
30
 
@tool
def tool_xano(query: str):
    """
    Searches and returns excerpts from the Xano documentation
    """
    # `invoke` is the Runnable entry point for retrievers and replaces the
    # deprecated `get_relevant_documents`; the docstring above is left
    # untouched because the @tool decorator uses it as the tool description.
    return retriever_xano.invoke(query)
37
 
38
 
39
  @tool
@@ -41,4 +56,4 @@ def tool_weweb(query: str):
41
  """
42
  Searches and returns excerpts from the Weweb documentation
43
  """
44
- return retriever_weweb.get_relevant_documents(query)
 
1
  import os
2
+ from langchain_community.vectorstores import Pinecone
3
  from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
4
  from langchain.retrievers import ContextualCompressionRetriever
5
  from langchain.retrievers.document_compressors import FlashrankRerank
6
+ from langchain.retrievers import EnsembleRetriever, BM25Retriever
7
  from langchain_core.tools import tool
8
+ from typing import Any
 
 
 
9
 
10
+ from apps.agent.utils import load_and_split_docs
11
+ from apps.agent.constant import (
12
+ INDEX_NAME_WEWEB,
13
+ INDEX_NAME_XANO,
14
+ URLS_WEWEB,
15
+ URLS_XANO,
16
+ )
17
 
18
  embeddings = FastEmbedEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en")
19
  compressor = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")
20
 
21
def ensemble_retriever(
    index_name: str,
    docs: Any,
    embeddings,
    compressor,
    bm25_k: int = 6,
    weights=(0.6, 0.4),
):
    """Build a reranked hybrid (vector + BM25) retriever.

    Args:
        index_name: Name of the existing Pinecone index to query.
        docs: Documents used to fit the BM25 keyword retriever.
        embeddings: Embedding model passed to the Pinecone vector store.
        compressor: Document compressor used as the final reranking stage.
        bm25_k: Number of documents the BM25 retriever returns.
        weights: Ensemble weights, ordered ``(vector, bm25)``.

    Returns:
        A ``ContextualCompressionRetriever`` wrapping the ensemble, or
        wrapping the vector retriever alone when *docs* is empty.
    """
    # Vector retriever backed by the existing Pinecone index.
    vectorstore = Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)
    vector_retriever = vectorstore.as_retriever()

    if docs:
        # Keyword retriever fitted on the supplied documents.
        bm25 = BM25Retriever.from_documents(docs)
        bm25.k = bm25_k
        base_retriever = EnsembleRetriever(
            retrievers=[vector_retriever, bm25],
            weights=list(weights),
        )
    else:
        # BM25 cannot be fitted on an empty corpus (e.g. when no page could be
        # loaded), so fall back to vector search alone instead of failing
        # while the module is being imported.
        base_retriever = vector_retriever

    # Reranker
    return ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=base_retriever
    )
37
 
38
# load data
# Runs at import time: loads the pages listed in URLS_XANO / URLS_WEWEB and
# splits them into chunks via load_and_split_docs.
data_xano = load_and_split_docs(URLS_XANO)
data_weweb = load_and_split_docs(URLS_WEWEB)

# create retriever
# One retriever per product, pairing its Pinecone index with the chunks
# loaded above.
retriever_xano = ensemble_retriever(INDEX_NAME_XANO, data_xano, embeddings, compressor)
retriever_weweb = ensemble_retriever(INDEX_NAME_WEWEB, data_weweb, embeddings, compressor)
45
 
# Agent tool: forwards the query string to the module-level Xano retriever.
@tool
def tool_xano(query: str):
    """
    Searches and returns excerpts from the Xano documentation
    """
    # Returns whatever the retriever yields for this query (presumably a list
    # of documents -- confirm against the retriever's API).
    return retriever_xano.invoke(query)
52
 
53
 
54
  @tool
 
56
  """
57
  Searches and returns excerpts from the Weweb documentation
58
  """
59
+ return retriever_weweb.invoke(query)
apps/agent/utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ import logging
3
+ from typing import List
4
+ from langchain_community.document_loaders import SeleniumURLLoader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+
7
+ # add logger
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+ # get document
11
def load_and_split_docs(urls: List[str]):
    """Load web pages and split them into token-bounded chunks.

    Args:
        urls: Page URLs to fetch with ``SeleniumURLLoader``.

    Returns:
        The chunk documents produced by the splitter. Each chunk's metadata
        gains an ``id`` and a ``chunk-id`` entry, both short random hex
        strings.
    """
    # Ordered from coarsest to finest. Several entries are regular
    # expressions (heading markers, horizontal rules), so the splitter below
    # must be created with is_separator_regex=True; otherwise they are
    # escaped, matched literally, and never found in the text.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
    logger.info("Extracting web loader...")
    loader = SeleniumURLLoader(urls=urls)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,  # Maximum chunk length, measured in tiktoken tokens (value chosen arbitrarily)
        chunk_overlap=50,  # Overlap between consecutive chunks, also in tokens
        add_start_index=True,  # If `True`, includes chunk's start index in metadata
        strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
        separators=MARKDOWN_SEPARATORS,
        is_separator_regex=True,  # Treat the separators above as regex patterns
    )

    logger.info("Split and documnets...")
    docs_split = text_splitter.split_documents(docs)
    for doc in docs_split:
        # NOTE(review): 4 hex characters give only 65,536 possible values, so
        # these ids can collide on larger corpora -- confirm whether anything
        # downstream relies on them being unique.
        doc.metadata['id'] = str(uuid.uuid4())[:4]
        doc.metadata['chunk-id'] = str(uuid.uuid4())[-4:]
    return docs_split