momenaca committed
Commit 7bfa7e6 · 1 Parent(s): 0cefe4a

update unfinished agent

celsius_csrd_chatbot/agent.py ADDED
@@ -0,0 +1,74 @@
+ import sys
+ import os
+ from contextlib import contextmanager
+
+ from langchain.schema import Document
+ from langgraph.graph import END, StateGraph
+ from langchain_core.runnables.graph import MermaidDrawMethod
+
+ from typing_extensions import TypedDict
+ from typing import List
+
+ from IPython.display import display, HTML, Image
+
+ from celsius_csrd_chatbot.chains.esrs_categorization import (
+     make_esrs_categorization_node,
+ )
+ from celsius_csrd_chatbot.chains.retriever import make_retriever_node
+ from celsius_csrd_chatbot.chains.answer_rag import make_rag_node
+
+
+ class GraphState(TypedDict):
+     """
+     Represents the state of our graph.
+     """
+
+     query: str
+     esrs_type: str
+     documents: List[Document]
+     answer: str
+
+
+ def route_intent(state):
+     # NOTE: not yet wired into the graph; kept for a future conditional edge
+     # once an "intent_esrs" node exists.
+     esrs = state["esrs_type"]
+     if esrs == "none":
+         return "intent_esrs"
+     else:
+         return "retrieve_documents"
+
+
+ def make_graph_agent(llm, vectorstore):
+     workflow = StateGraph(GraphState)
+
+     # Define the node functions
+     categorize_esrs = make_esrs_categorization_node(llm)
+     retrieve_documents = make_retriever_node(vectorstore)
+     answer_rag = make_rag_node(llm)
+
+     # Define the nodes
+     workflow.add_node("categorize_esrs", categorize_esrs)
+     workflow.add_node("retrieve_documents", retrieve_documents)
+     workflow.add_node("answer_rag", answer_rag)
+
+     # Entry point
+     workflow.set_entry_point("categorize_esrs")
+
+     # Define the edges
+     workflow.add_edge("categorize_esrs", "retrieve_documents")
+     workflow.add_edge("retrieve_documents", "answer_rag")
+     workflow.add_edge("answer_rag", END)
+
+     # Compile
+     app = workflow.compile()
+     return app
+
+
+ def display_graph(app):
+     display(
+         Image(
+             app.get_graph(xray=True).draw_mermaid_png(
+                 draw_method=MermaidDrawMethod.API,
+             )
+         )
+     )
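For reference, a minimal usage sketch of the compiled graph (not part of this commit; llm and vectorstore are placeholders for whatever chat model and vector store the app builds elsewhere, and ainvoke is used because the RAG node is async):

# Illustrative sketch only -- llm and vectorstore are assumed to exist already.
import asyncio

from celsius_csrd_chatbot.agent import make_graph_agent

app = make_graph_agent(llm, vectorstore)

# categorize_esrs -> retrieve_documents -> answer_rag, then END;
# the final state carries the generated answer.
final_state = asyncio.run(
    app.ainvoke({"query": "What does ESRS E1 ask companies to disclose about energy?"})
)
print(final_state["answer"])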
celsius_csrd_chatbot/chains/__init__.py ADDED
File without changes
celsius_csrd_chatbot/chains/answer_rag.py ADDED
@@ -0,0 +1,96 @@
+ from operator import itemgetter
+
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts.prompt import PromptTemplate
+ from langchain_core.prompts.base import format_document
+
+ esrs_wiki = """
+ The Corporate Sustainability Reporting Directive (CSRD) is a mandate that requires all companies to report on their sustainability initiatives. In response to this directive, the European Sustainability Reporting Standards (ESRS) were developed. These standards are a key tool in promoting the transition to a sustainable economy within the EU, providing a structured framework for companies to disclose their sustainability initiatives. The ESRS cover a wide range of environmental, social, and governance (ESG) issues, including climate change, biodiversity, and human rights. Companies that adhere to the ESRS can provide investors with valuable insights into their sustainability impact, thereby informing investment decisions. The ESRS are designed to be highly interoperable with global reporting standards, which helps to avoid unnecessary duplication in reporting by companies. The reporting requirements based on the ESRS will be gradually implemented for different companies over time. In summary, the ESRS play a critical role in fostering sustainable finance and enabling companies to demonstrate their commitment to the green deal agenda while accessing sustainable finance.
+
+ ---
+ """
+
+ reformulation_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
+
+ Chat History:
+ {chat_history}
+ Follow Up Input: {question}
+ Standalone question:"""
+
+ CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(reformulation_template)
+
+ answering_template = """
+ You are an ESG expert with 20 years of experience analyzing corporate sustainability reports.
+ You are a specialist in the upcoming CSRD regulation and, more generally, in corporate sustainability disclosure requirements.
+ {esrs_wiki}
+
+ You will answer the question based on the following passages extracted from CSRD specific sustainability guidelines and reports:
+ ```
+ {context}
+ ```
+
+ Guidelines:
+ 1. Context: You'll receive relevant excerpts from a CSRD-specific sustainability guideline or report to address a given question.
+ 2. Relevance: Only include passages directly pertaining to the question; omit irrelevant content.
+ 3. Facts and Figures: Prioritize factual information in your response.
+ 4. Conciseness: Keep answers sharp and succinct, avoiding unnecessary context.
+ 5. Focus: Address the specific question without veering into related topics.
+ 6. Honesty: If unsure, state that you don't know rather than inventing an answer.
+ 7. Source Attribution: When using information from a passage, mention it as [Doc i] at the end of the sentence (where 'i' represents the document number).
+ 8. Multiple Sources: If the same content appears in multiple documents, cite them collectively (e.g., [Doc i, Doc j, Doc k]).
+ 9. Structured Paragraphs: Instead of bullet-point summaries, compile your responses into well-structured paragraphs.
+ 10. Method Focus: When addressing "how" questions, emphasize methods and procedures over outcomes.
+ 11. Selective Usage: You're not obligated to use every passage; include only those relevant to the question.
+ 12. Insufficient Information: If documents lack necessary details, indicate that you don't have enough information.
+
+ Question: {question}
+ Answer:
+ """
+
+ DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
+
+
+ def _combine_documents(docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"):
+     doc_strings = []
+     for i, doc in enumerate(docs):
+         chunk_type = "Doc"
+         if isinstance(doc, str):
+             doc_formatted = doc
+         else:
+             doc_formatted = format_document(doc, document_prompt)
+         doc_string = f"{chunk_type} {i+1}: " + doc_formatted
+         doc_string = doc_string.replace("\n", " ")
+         doc_strings.append(doc_string)
+
+     return sep.join(doc_strings)
+
+
+ def get_text_docs(x):
+     return [doc for doc in x if doc.metadata["chunk_type"] == "text"]
+
+
+ def make_rag_chain(llm):
+     prompt = ChatPromptTemplate.from_template(answering_template)
+     # The prompt expects "context", "question" and "esrs_wiki" as input variables
+     chain = (
+         {
+             "context": lambda x: _combine_documents(x["documents"]),
+             "question": itemgetter("query"),
+             "esrs_wiki": lambda x: esrs_wiki,
+         }
+         | prompt
+         | llm
+         | StrOutputParser()
+     )
+     return chain
+
+
+ def make_rag_node(llm):
+     rag_chain = make_rag_chain(llm)
+
+     async def answer_rag(state, config):
+         answer = await rag_chain.ainvoke(state, config)
+         return {"answer": answer}
+
+     return answer_rag
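As a quick illustration of the "Doc i" labelling that _combine_documents produces for the citation guideline above (a sketch with invented passages, not part of this commit):

from langchain.schema import Document

from celsius_csrd_chatbot.chains.answer_rag import _combine_documents

docs = [
    Document(page_content="ESRS E1 covers climate change mitigation\nand adaptation.", metadata={}),
    Document(page_content="Scope 1, 2 and 3 emissions must be disclosed.", metadata={}),
]

print(_combine_documents(docs))
# Doc 1: ESRS E1 covers climate change mitigation and adaptation.
#
# Doc 2: Scope 1, 2 and 3 emissions must be disclosed.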
celsius_csrd_chatbot/chains/esrs_categorization.py ADDED
@@ -0,0 +1,121 @@
+ from langchain_core.pydantic_v1 import BaseModel, Field
+ from langchain.prompts.prompt import PromptTemplate
+ from langchain.output_parsers import PydanticOutputParser
+ from typing import Literal
+ from operator import itemgetter
+ from langchain_core.exceptions import OutputParserException
+
+
+ class ESRSAnalysis(BaseModel):
+     """Analyzing the user query to get ESRS type, sources and intent"""
+
+     esrs_type: Literal[
+         "ESRS 1",
+         "ESRS 2",
+         "ESRS E1",
+         "ESRS E2",
+         "ESRS E3",
+         "ESRS E4",
+         "ESRS E5",
+         "ESRS S1",
+         "ESRS S2",
+         "ESRS S3",
+         "ESRS S4",
+         "ESRS G1",
+         "none",
+     ] = Field(
+         description="""
+         Given a user question, choose which documents would be most relevant for answering it:
+
+         - ESRS 1 is for questions about general principles for preparing and presenting sustainability information in accordance with CSRD
+         - ESRS 2 is for questions about general disclosures related to sustainability reporting, including governance, strategy, impact, risk, opportunity management, and metrics and targets
+         - ESRS E1 is for questions about climate change, global warming, greenhouse gases (GHG) and energy
+         - ESRS E2 is for questions about air, water, and soil pollution, and dangerous substances
+         - ESRS E3 is for questions about water and marine resources
+         - ESRS E4 is for questions about biodiversity, nature, wildlife and ecosystems
+         - ESRS E5 is for questions about resource use and the circular economy
+         - ESRS S1 is for questions about workforce and labor issues, job security, fair pay, and health and safety
+         - ESRS S2 is for questions about workers in the value chain and workers' treatment
+         - ESRS S3 is for questions about affected communities and impact on local communities
+         - ESRS S4 is for questions about consumers and end users, customer privacy, safety, and inclusion
+         - ESRS G1 is for questions about governance, risk management, internal control, and business conduct
+         - none is for questions that do not fit into any of the above categories
+
+         Follow these guidelines:
+
+         - Ignore letter case: for example, 'esrs 1', 'Esrs 1' and 'ESRS 1' should all be treated as 'ESRS 1'.
+         - Some questions may relate to multiple ESRS. In that case, keep all options and format the output as: 'ESRS 1', 'ESRS 2'.
+         - Remember, if the question is not related to any ESRS, the output should be 'none'.
+         """,
+     )
+
+
+ def make_esrs_categorization_chain(llm):
+     parser = PydanticOutputParser(pydantic_object=ESRSAnalysis)
+     prompt_template = """
+     The following question is about ESRS-related topics. Please analyze the question and indicate whether it refers to a specific ESRS.
+
+     {format_instructions}
+
+     Please answer with the appropriate ESRS for the question.
+
+     Question: '{query}'
+     Answer:
+     """
+
+     prompt = PromptTemplate(
+         template=prompt_template,
+         input_variables=["query"],
+         partial_variables={"format_instructions": parser.get_format_instructions()},
+     )
+     chain = {"query": itemgetter("query")} | prompt | llm | parser
+
+     return chain
+
+
+ def make_esrs_categorization_node(llm):
+     def categorize_message(state):
+         query = state["query"]
+         categorization_chain = make_esrs_categorization_chain(llm)
+         # The chain's first step applies itemgetter("query"), so pass a dict
+         output = categorization_chain.invoke({"query": query})
+
+         if not output:
+             raise OutputParserException("Output is empty")
+
+         # PydanticOutputParser already returns a validated ESRSAnalysis,
+         # so expose only the field the graph state needs
+         return {"esrs_type": output.esrs_type}
+
+     return categorize_message
+
+ # intent: str = Field(
+ #     enum=[
+ #         "Specific topic",
+ #         "Implementation reco",
+ #         "KPI extraction",
+ #     ],
+ #     description="""
+ #     Categorize the user query in one of the following categories,
+
+ #     Examples:
+ #     - Specific topic: "What are the specificities of ESRS E1 ?"
+ #     - Implementation reco: "How should I compute my scope 1 reduction target ?"
+ #     - KPI extraction: "When will the CSRD be mandatory for my small French company ?"
+ #     """,
+ # )
+
+ # sources: str = Field(
+ #     enum=["ESRS", "External"],
+ #     description="""
+ #     Given a user question choose which documents would be most relevant for answering their question,
+ #     - ESRS is for questions about a specific environmental, social or governance topic, as well as CSRD's general principles and disclosures
+ #     - External is for questions about how to implement the CSRD, or general questions about CSRD's context
+ #     """,
+ # )
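A rough sketch of what the categorization node is expected to return, assuming a chat model (llm is a placeholder) that follows the parser's format instructions; the question is invented and not part of this commit:

from celsius_csrd_chatbot.chains.esrs_categorization import make_esrs_categorization_node

categorize_esrs = make_esrs_categorization_node(llm)

state_update = categorize_esrs({"query": "How should we report our greenhouse gas emissions?"})
# With a cooperative model, the parsed ESRSAnalysis yields something like:
# {"esrs_type": "ESRS E1"}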
celsius_csrd_chatbot/chains/esrs_intent.py ADDED
@@ -0,0 +1,121 @@
+ from langchain_core.pydantic_v1 import BaseModel, Field
+ from langchain.prompts.prompt import PromptTemplate
+ from langchain.output_parsers import PydanticOutputParser
+ from typing import Literal
+ from operator import itemgetter
+ from langchain_core.exceptions import OutputParserException
+
+
+ class ESRSAnalysis(BaseModel):
+     """Analyzing the user query to get ESRS type, sources and intent"""
+
+     esrs_type: Literal[
+         "ESRS 1",
+         "ESRS 2",
+         "ESRS E1",
+         "ESRS E2",
+         "ESRS E3",
+         "ESRS E4",
+         "ESRS E5",
+         "ESRS S1",
+         "ESRS S2",
+         "ESRS S3",
+         "ESRS S4",
+         "ESRS G1",
+         "none",
+     ] = Field(
+         description="""
+         Given a user question, choose which documents would be most relevant for answering it:
+
+         - ESRS 1 is for questions about general principles for preparing and presenting sustainability information in accordance with CSRD
+         - ESRS 2 is for questions about general disclosures related to sustainability reporting, including governance, strategy, impact, risk, opportunity management, and metrics and targets
+         - ESRS E1 is for questions about climate change, global warming, greenhouse gases (GHG) and energy
+         - ESRS E2 is for questions about air, water, and soil pollution, and dangerous substances
+         - ESRS E3 is for questions about water and marine resources
+         - ESRS E4 is for questions about biodiversity, nature, wildlife and ecosystems
+         - ESRS E5 is for questions about resource use and the circular economy
+         - ESRS S1 is for questions about workforce and labor issues, job security, fair pay, and health and safety
+         - ESRS S2 is for questions about workers in the value chain and workers' treatment
+         - ESRS S3 is for questions about affected communities and impact on local communities
+         - ESRS S4 is for questions about consumers and end users, customer privacy, safety, and inclusion
+         - ESRS G1 is for questions about governance, risk management, internal control, and business conduct
+         - none is for questions that do not fit into any of the above categories
+
+         Follow these guidelines:
+
+         - Ignore letter case: for example, 'esrs 1', 'Esrs 1' and 'ESRS 1' should all be treated as 'ESRS 1'.
+         - Some questions may relate to multiple ESRS. In that case, keep all options and format the output as: 'ESRS 1', 'ESRS 2'.
+         - Remember, if the question is not related to any ESRS, the output should be 'none'.
+         """,
+     )
+
+
+ def make_esrs_categorization_chain(llm):
+     parser = PydanticOutputParser(pydantic_object=ESRSAnalysis)
+     prompt_template = """
+     The following question is about ESRS-related topics. Please analyze the question and indicate whether it refers to a specific ESRS.
+
+     {format_instructions}
+
+     Please answer with the appropriate ESRS for the question.
+
+     Question: '{query}'
+     Answer:
+     """
+
+     prompt = PromptTemplate(
+         template=prompt_template,
+         input_variables=["query"],
+         partial_variables={"format_instructions": parser.get_format_instructions()},
+     )
+     chain = {"query": itemgetter("query")} | prompt | llm | parser
+
+     return chain
+
+
+ def make_esrs_categorization_node(llm):
+     def categorize_message(state):
+         query = state["query"]
+         categorization_chain = make_esrs_categorization_chain(llm)
+         # The chain's first step applies itemgetter("query"), so pass a dict
+         output = categorization_chain.invoke({"query": query})
+
+         if not output:
+             raise OutputParserException("Output is empty")
+
+         # PydanticOutputParser already returns a validated ESRSAnalysis,
+         # so expose only the field the graph state needs
+         return {"esrs_type": output.esrs_type}
+
+     return categorize_message
+
+ # intent: str = Field(
+ #     enum=[
+ #         "Specific topic",
+ #         "Implementation reco",
+ #         "KPI extraction",
+ #     ],
+ #     description="""
+ #     Categorize the user query in one of the following categories,
+
+ #     Examples:
+ #     - Specific topic: "What are the specificities of ESRS E1 ?"
+ #     - Implementation reco: "How should I compute my scope 1 reduction target ?"
+ #     - KPI extraction: "When will the CSRD be mandatory for my small French company ?"
+ #     """,
+ # )
+
+ # sources: str = Field(
+ #     enum=["ESRS", "External"],
+ #     description="""
+ #     Given a user question choose which documents would be most relevant for answering their question,
+ #     - ESRS is for questions about a specific environmental, social or governance topic, as well as CSRD's general principles and disclosures
+ #     - External is for questions about how to implement the CSRD, or general questions about CSRD's context
+ #     """,
+ # )
celsius_csrd_chatbot/chains/retriever.py ADDED
@@ -0,0 +1,23 @@
+ def make_retriever_node(vectorstore, k=10):
+
+     def retrieve_documents(state):
+         sources = state["esrs_type"]
+         query = state["query"]
+         if sources == "none":
+             filters_full = {}
+         else:
+             filters_full = {"ESRS": {"$in": [sources]}}
+
+         # similarity_search_with_score is called on the vectorstore itself
+         # and returns (Document, score) pairs
+         results = vectorstore.similarity_search_with_score(
+             query=query, filter=filters_full, k=k
+         )
+         docs = []
+         for doc, score in results:
+             doc.metadata["similarity_score"] = score
+             docs.append(doc)
+
+         docs = sorted(docs, key=lambda x: x.metadata["similarity_score"], reverse=True)
+         new_state = {"documents": docs}
+
+         return new_state
+
+     return retrieve_documents
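For context, a small sketch of how the retriever node would be exercised on its own, assuming a vector store whose chunks carry an "ESRS" metadata key (all names here are placeholders, not part of this commit):

from celsius_csrd_chatbot.chains.retriever import make_retriever_node

retrieve_documents = make_retriever_node(vectorstore, k=5)

new_state = retrieve_documents(
    {"esrs_type": "ESRS E1", "query": "transition plan for climate change mitigation"}
)
# new_state["documents"] holds Documents restricted to chunks tagged ESRS E1,
# each annotated with a similarity_score and sorted by that score.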