|
import os |
|
import re |
|
from typing import Tuple, List |
|
from dotenv import load_dotenv |
|
from msal import ConfidentialClientApplication |
|
from langchain.schema import format_document |
|
|
|
|
|
def init_env():
    """Call ``load_dotenv()`` on a best-effort basis.

    A failure while loading is not fatal: it is logged as a warning and the
    function returns normally, so callers can rely on variables that are
    already present in the process environment.
    """
    import logging

    try:
        load_dotenv()
    except Exception:
        # Only ``Exception`` is caught so KeyboardInterrupt / SystemExit still
        # propagate; the failure is logged instead of being silently dropped.
        logging.getLogger(__name__).warning(
            "Could not load environment variables via load_dotenv()",
            exc_info=True,
        )
|
|
|
|
|
def get_token() -> str | None:
    """Acquire an access token through MSAL's ``acquire_token_for_client``.

    The client is configured from the environment variables ``CLIENT_ID``,
    ``CLIENT_SECRET`` and ``TENANT_ID``; the requested scope is read from
    ``SCOPE``.

    Returns:
        The access token string, or ``None`` when the response carries no
        ``"access_token"`` entry (for example an error response).
    """
    import logging

    app = ConfidentialClientApplication(
        client_id=os.getenv("CLIENT_ID"),
        client_credential=os.getenv("CLIENT_SECRET"),
        authority=f"https://login.microsoftonline.com/{os.getenv('TENANT_ID')}",
    )
    result = app.acquire_token_for_client(scopes=[os.getenv("SCOPE")])
    if not result:
        return None

    # A failed request yields a dict without "access_token"; use .get so the
    # documented ``None`` is returned rather than raising KeyError.
    token = result.get("access_token")
    if token is None:
        logging.getLogger(__name__).warning(
            "Token acquisition failed: %s (%s)",
            result.get("error"),
            result.get("error_description"),
        )
    return token
|
|
|
|
|
def get_llm():
    """Create an ``AzureChatOpenAI`` client configured via environment variables.

    The token returned by ``get_token()`` is exported as ``OPENAI_API_KEY``,
    and ``AZURE_OPENAI_ENDPOINT`` is assembled from ``OPENAI_API_ENDPOINT``,
    ``DEPLOYMENT_ID`` and ``OPENAI_API_VERSION``.

    Returns:
        An ``AzureChatOpenAI`` instance constructed without explicit arguments.

    Raises:
        RuntimeError: If ``get_token()`` does not return a token.
    """
    # NOTE(review): no module-level import of AzureChatOpenAI is visible in
    # this file, so it is imported here; confirm the module path matches the
    # installed langchain version.
    from langchain.chat_models import AzureChatOpenAI

    token = get_token()
    if not token:
        # os.environ only accepts strings; fail with a clear message instead
        # of the opaque TypeError raised when assigning None.
        raise RuntimeError(
            "Could not acquire an access token; cannot configure the LLM client"
        )

    os.environ["OPENAI_API_KEY"] = token
    os.environ["AZURE_OPENAI_ENDPOINT"] = (
        f"{os.getenv('OPENAI_API_ENDPOINT')}{os.getenv('DEPLOYMENT_ID')}"
        f"/chat/completions?api-version={os.getenv('OPENAI_API_VERSION')}"
    )

    return AzureChatOpenAI()
|
|
|
|
|
def _combine_documents(docs, document_prompt, document_separator="\n\n"): |
|
doc_strings = [ |
|
f"Document {i}: \n'''\n{format_document(doc, document_prompt)}\n'''" |
|
for i, doc in enumerate(docs, 1) |
|
] |
|
return document_separator.join(doc_strings) |
|
|
|
|
|
def _format_chat_history(chat_history: List[Tuple]) -> str: |
|
turn = 1 |
|
buffer = [] |
|
for dialogue in chat_history: |
|
buffer.append(("Human: " if turn else "Assistant: ") + dialogue.content) |
|
turn ^= 1 |
|
return "\n".join(buffer) + "\n" |
|
|
|
|
|
def make_pairs(lst):
    """Group consecutive items of an even-length list into tuple pairs.

    ``[a, b, c, d]`` becomes ``[(a, b), (c, d)]``. An odd-length input raises
    ``IndexError`` because the final item has no partner.
    """
    # Round up so that an odd length still attempts (and fails on) the last pair.
    pair_count = (len(lst) + 1) // 2
    return [(lst[2 * k], lst[2 * k + 1]) for k in range(pair_count)]
|
|
|
|
|
def make_html_source(i, doc):
    """Render a document as an HTML "card" snippet.

    Args:
        i: Number shown in the card title and used for the card's
            ``id="doc{i}"`` anchor.
        doc: Object exposing ``page_content`` and a ``metadata`` mapping.
            ``metadata["source"]`` selects the footer: for ``"ESRS"`` the keys
            ``"ESRS_filter"``, ``"DR"`` and ``"Data type"`` are shown,
            otherwise only the source itself.

    Returns:
        The HTML markup for the card as a string.
    """
    metadata = doc.metadata
    if metadata["source"] == "ESRS":
        footer_spans = [
            f"<span>{metadata['ESRS_filter']} \n</span>",
            f"<span>DR: {metadata['DR']} \n</span>",
            f"<span>Data type: {metadata['Data type']} \n</span>",
        ]
    else:
        footer_spans = [f"<span>Source: {metadata['source']} \n</span>"]
    footer = "\n        ".join(footer_spans)

    # Every card carries an id so "#doc{i}" links resolve for all sources, and
    # the heading is closed with the matching </h3> tag.
    # NOTE(review): page_content and metadata values are inserted unescaped;
    # consider html.escape if they can contain untrusted markup.
    return f"""
<div class="card" id="doc{i}">
    <div class="card-content">
        <h3>Doc {i}</h3>
        <p>{doc.page_content}</p>
    </div>
    <div class="card-footer">
        {footer}
    </div>
</div>
"""
|
|
|
|
|
def parse_output_llm_with_sources(output):
    """Replace bracketed document citations with HTML anchor links.

    Citations of the form ``[Doc 1]`` or ``[Doc 1, Doc 2]`` are replaced by one
    superscript link per cited number, each pointing at ``#doc<number>``. All
    other text is returned unchanged, including ordinary text that merely
    begins with "Doc".

    Args:
        output: The text to process.

    Returns:
        The text with every citation replaced by its anchor markup.
    """
    citation_pattern = r"\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]"

    def _render_citation(match):
        # The pattern guarantees the group holds only "Doc <n>" items, so the
        # digit runs are exactly the cited document numbers.
        doc_numbers = re.findall(r"\d+", match.group(1))
        return "".join(
            f"""<a href="#doc{number}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{number}</sup></span></a>"""
            for number in doc_numbers
        )

    # Substituting on actual matches (rather than splitting and guessing by
    # prefix) keeps plain text that starts with "Doc" from being rewritten.
    return re.sub(citation_pattern, _render_citation, output)
|
|