# NOTE: removed non-code residue from the page this file was extracted from
# ("Spaces: Sleeping" — Hugging Face Space status text). Module code follows.
import os
import re
from typing import List, Tuple

from dotenv import load_dotenv
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import format_document
from msal import ConfidentialClientApplication
def init_env():
    """Load environment variables from a local .env file, if present.

    Best-effort: a missing or unreadable .env file is tolerated because
    the variables may already be set in the process environment
    (e.g. when deployed).
    """
    try:
        load_dotenv()
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior
        # without masking interpreter-level signals.
        pass
def get_token() -> str | None:
    """Acquire an OAuth2 client-credentials token from Azure AD.

    Reads CLIENT_ID, CLIENT_SECRET, TENANT_ID and SCOPE from the
    environment.

    Returns:
        The access token string, or None when no token could be
        acquired (e.g. bad credentials or consent not granted).
    """
    app = ConfidentialClientApplication(
        client_id=os.getenv("CLIENT_ID"),
        client_credential=os.getenv("CLIENT_SECRET"),
        authority=f"https://login.microsoftonline.com/{os.getenv('TENANT_ID')}",
    )
    result = app.acquire_token_for_client(scopes=[os.getenv("SCOPE")])
    # On failure MSAL returns a dict carrying "error"/"error_description"
    # but no "access_token" key, so indexing raised KeyError; .get turns
    # that into the documented None return.
    if result:
        return result.get("access_token")
    return None
def get_llm():
    """Build an AzureChatOpenAI client authenticated via an AAD token.

    Sets OPENAI_API_KEY (bearer token) and AZURE_OPENAI_ENDPOINT — built
    from OPENAI_API_ENDPOINT, DEPLOYMENT_ID and OPENAI_API_VERSION — in
    the process environment before instantiating the client.

    Raises:
        RuntimeError: when no access token could be acquired.
    """
    token = get_token()
    if token is None:
        # os.environ values must be strings; assigning None raised an
        # opaque TypeError here. Fail with an actionable message instead.
        raise RuntimeError("Could not acquire an Azure AD access token.")
    os.environ["OPENAI_API_KEY"] = token
    os.environ["AZURE_OPENAI_ENDPOINT"] = (
        f"{os.getenv('OPENAI_API_ENDPOINT')}{os.getenv('DEPLOYMENT_ID')}/chat/completions?api-version={os.getenv('OPENAI_API_VERSION')}"
    )
    return AzureChatOpenAI()
def _combine_documents(docs, document_prompt, document_separator="\n\n"):
    """Render each document with *document_prompt* and join the results.

    Every document is wrapped in a numbered, triple-quoted section
    (1-based) so an LLM can cite individual sources by index.
    """
    rendered = []
    for index, document in enumerate(docs, 1):
        body = format_document(document, document_prompt)
        rendered.append(f"Document {index}: \n'''\n{body}\n'''")
    return document_separator.join(rendered)
def _format_chat_history(chat_history: List[Tuple]) -> str: | |
turn = 1 | |
buffer = [] | |
for dialogue in chat_history: | |
buffer.append(("Human: " if turn else "Assistant: ") + dialogue.content) | |
turn ^= 1 | |
return "\n".join(buffer) + "\n" | |
def make_pairs(lst):
    """Group a flat list of even length into a list of 2-tuples.

    Example: [a, b, c, d] -> [(a, b), (c, d)].

    Raises:
        IndexError: if *lst* has odd length (last item has no partner).
    """
    return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
def make_html_source(i, doc):
    """Render retrieved document *doc* as an HTML card numbered *i*.

    ESRS documents get regulation metadata (ESRS filter, DR, data type)
    in the footer and an ``id="doc{i}"`` anchor so inline citations can
    link to them; any other source just shows its origin in the footer.
    """
    if doc.metadata["source"] == "ESRS":
        # Fixed mismatched heading tags: was <h3>...</h2>.
        return f"""
<div class="card" id="doc{i}">
    <div class="card-content">
        <h3>Doc {i}</h3>
        <p>{doc.page_content}</p>
    </div>
    <div class="card-footer">
        <span>{doc.metadata['ESRS_filter']} \n</span>
        <span>DR: {doc.metadata['DR']} \n</span>
        <span>Data type: {doc.metadata['Data type']} \n</span>
    </div>
</div>
"""
    else:
        return f"""
<div class="card">
    <div class="card-content">
        <h3>Doc {i}</h3>
        <p>{doc.page_content}</p>
    </div>
    <div class="card-footer">
        <span>Source: {doc.metadata['source']} \n</span>
    </div>
</div>
"""
def parse_output_llm_with_sources(output):
    """Replace "[Doc X]" citations in LLM output with HTML anchor links.

    Each bracketed reference such as "[Doc 1]" or "[Doc 1, Doc 2]"
    becomes one superscript link per document, pointing at the matching
    "#docN" card anchor. All other text is passed through unchanged.
    """
    pieces = re.split(r"\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]", output)
    rendered = []
    for piece in pieces:
        if not piece.startswith("Doc"):
            rendered.append(piece)
            continue
        anchors = []
        for ref in piece.split(","):
            num = ref.lower().replace("doc", "").strip()
            anchors.append(
                f"""<a href="#doc{num}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{num}</sup></span></a>"""
            )
        rendered.append("".join(anchors))
    return "".join(rendered)