File size: 3,142 Bytes
e9c0973
d708cb9
e9c0973
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d708cb9
e9c0973
 
 
 
 
d708cb9
e9c0973
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d708cb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import re
from typing import Tuple, List
from dotenv import load_dotenv
from msal import ConfidentialClientApplication
from langchain_openai import AzureChatOpenAI
from langchain.schema import format_document


def init_env():
    """Load variables from a local .env file into the process environment.

    Best-effort: a missing or unreadable .env file is not fatal, since the
    variables may already be set in the real environment (e.g. in CI).
    """
    try:
        load_dotenv()
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still
    # propagate; any ordinary failure to read .env is deliberately ignored.
    except Exception:
        pass


def get_token() -> str | None:
    """Acquire an Azure AD bearer token via the client-credentials flow.

    Reads CLIENT_ID, CLIENT_SECRET, TENANT_ID and SCOPE from the
    environment.

    Returns:
        The access-token string, or ``None`` when acquisition failed.
    """
    app = ConfidentialClientApplication(
        client_id=os.getenv("CLIENT_ID"),
        client_credential=os.getenv("CLIENT_SECRET"),
        authority=f"https://login.microsoftonline.com/{os.getenv('TENANT_ID')}",
    )
    result = app.acquire_token_for_client(scopes=[os.getenv("SCOPE")])
    # MSAL reports failure by returning a dict with "error" keys rather than
    # raising, so result["access_token"] could KeyError — use .get() instead.
    if result:
        return result.get("access_token")
    return None


def get_llm():
    """Configure Azure OpenAI environment variables and return a chat client.

    Side effects: sets OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT in
    ``os.environ`` from the acquired token and the OPENAI_API_ENDPOINT /
    DEPLOYMENT_ID / OPENAI_API_VERSION environment variables.

    Returns:
        An ``AzureChatOpenAI`` instance configured from the environment.

    Raises:
        RuntimeError: if no access token could be acquired.
    """
    token = get_token()
    # get_token() returns None on failure; fail fast with a clear message
    # instead of letting os.environ[...] = None raise an opaque TypeError.
    if token is None:
        raise RuntimeError("Could not acquire Azure AD access token for OpenAI")
    os.environ["OPENAI_API_KEY"] = token
    os.environ["AZURE_OPENAI_ENDPOINT"] = (
        f"{os.getenv('OPENAI_API_ENDPOINT')}{os.getenv('DEPLOYMENT_ID')}/chat/completions?api-version={os.getenv('OPENAI_API_VERSION')}"
    )

    return AzureChatOpenAI()


def _combine_documents(docs, document_prompt, document_separator="\n\n"):
    """Render each document through *document_prompt* and join the results.

    Each document is numbered from 1 and wrapped in triple-quote fences,
    then all rendered documents are joined with *document_separator*.
    """
    rendered = []
    for position, document in enumerate(docs, start=1):
        body = format_document(document, document_prompt)
        rendered.append(f"Document {position}: \n'''\n{body}\n'''")
    return document_separator.join(rendered)


def _format_chat_history(chat_history: List[Tuple]) -> str:
    turn = 1
    buffer = []
    for dialogue in chat_history:
        buffer.append(("Human: " if turn else "Assistant: ") + dialogue.content)
        turn ^= 1
    return "\n".join(buffer) + "\n"


def make_pairs(lst):
    """Group consecutive elements of an even-length list into tuple pairs.

    ``[a, b, c, d]`` becomes ``[(a, b), (c, d)]``. Raises IndexError when
    the input has odd length.
    """
    paired = []
    for start in range(0, len(lst), 2):
        paired.append((lst[start], lst[start + 1]))
    return paired


def make_html_source(i, doc):
    """Render a retrieved document as an HTML source card.

    ESRS documents (``doc.metadata["source"] == "ESRS"``) get an anchor id
    (``doc{i}``, the target of citation links) and an ESRS-specific footer
    (filter, DR, data type); all other sources get a plain card whose footer
    shows the source name.
    """
    # Fixed: both headings previously opened <h3> but closed with </h2>,
    # producing invalid HTML.
    if doc.metadata["source"] == "ESRS":
        return f"""
<div class="card" id="doc{i}">
    <div class="card-content">
        <h3>Doc {i}</h3>
        <p>{doc.page_content}</p>
    </div>
    <div class="card-footer">
        <span>{doc.metadata['ESRS_filter']} \n</span>
        <span>DR: {doc.metadata['DR']} \n</span>
        <span>Data type: {doc.metadata['Data type']} \n</span>
    </div>
</div>
    """
    else:
        return f"""
<div class="card">
    <div class="card-content">
        <h3>Doc {i}</h3>
        <p>{doc.page_content}</p>
    </div>
    <div class="card-footer">
        <span>Source: {doc.metadata['source']} \n</span>
    </div>
</div>
    """


def parse_output_llm_with_sources(output):
    """Replace bracketed ``[Doc N]`` / ``[Doc N, Doc M]`` citations with HTML links.

    Each cited document number becomes a superscript anchor pointing at the
    matching ``#docN`` card id; all other text is passed through unchanged.
    """
    # With a single capturing group, re.split guarantees the captured
    # citation bodies sit at ODD indices and plain text at EVEN indices.
    content_parts = re.split(r"\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]", output)
    parts = []
    for idx, part in enumerate(content_parts):
        # Dispatch on index parity rather than part.startswith("Doc"):
        # ordinary text that merely begins with "Doc" must not be rewritten.
        if idx % 2 == 1:
            subparts = [
                subpart.lower().replace("doc", "").strip()
                for subpart in part.split(",")
            ]
            parts.append(
                "".join(
                    f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>"""
                    for subpart in subparts
                )
            )
        else:
            parts.append(part)
    return "".join(parts)