import gradio as gr
from sentence_transformers import SentenceTransformer
from wikipediaapi import Wikipedia
import textwrap
import numpy as np
from openai import OpenAI

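# Pipeline overview (all steps implemented below):
#   1. Fetch the requested Arabic Wikipedia page.
#   2. Chunk it into paragraphs and embed them with the chosen model.
#   3. Retrieve the three paragraphs most similar to the query.
#   4. Ask GPT-4o to answer the query using only that context.
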
# Run the full RAG pipeline for a single query
def process_query(wiki_page, model_name, embed_dim, query, api_key):
    model_mapping = {
        "Arabic-mpnet-base-all-nli-triplet": "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
        "Arabic-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
        "Arabert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
        "Arabic-labse-Matryoshka": "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
        "Marbert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka"
    }

    model_path = model_mapping[model_name]
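    # truncate_dim trims Matryoshka embeddings to the requested dimension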
    model = SentenceTransformer(model_path, trust_remote_code=True, truncate_dim=embed_dim)
    wiki = Wikipedia('RAGBot/0.0', 'ar')
    doc = wiki.page(wiki_page).text
    # Chunk the article into paragraphs, dropping empty chunks
    paragraphs = [p for p in doc.split('\n\n') if p.strip()]

    # Embeddings are normalized, so a dot product gives cosine similarity
    docs_embed = model.encode(paragraphs, normalize_embeddings=True)
    query_embed = model.encode(query, normalize_embeddings=True)
    similarities = docs_embed @ query_embed
    # Indices of the three most similar paragraphs, best match first
    top_3_idx = np.argsort(similarities)[-3:][::-1].tolist()
    most_similar_documents = [paragraphs[idx] for idx in top_3_idx]

    # Wrap each retrieved paragraph and join them into a single context block
    CONTEXT = "\n\n".join(textwrap.fill(p, width=100) for p in most_similar_documents)

    prompt = f"""
        use the following CONTEXT to answer the QUESTION at the end.
        If you don't know the answer, just say that you don't know, don't try to make up an answer.
        CONTEXT: {CONTEXT}
        QUESTION: {query}
    """

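    # Call GPT-4o with the assembled prompt and return its answer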
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt},
        ]
    )

    return response.choices[0].message.content

# Define the interface
wiki_page_input = gr.Textbox(label="Wikipedia Page (in Arabic)")
query_input = gr.Textbox(label="Query (in Arabic)")
api_key_input = gr.Textbox(label="OpenAI API Key", type="password")

model_choice = gr.Dropdown(
    choices=[
        "Arabic-mpnet-base-all-nli-triplet",
        "Arabic-all-nli-triplet-Matryoshka",
        "Arabert-all-nli-triplet-Matryoshka",
        "Arabic-labse-Matryoshka",
        "Marbert-all-nli-triplet-Matryoshka"
    ],
    value="Arabic-mpnet-base-all-nli-triplet",  # default so a submit never passes None
    label="Choose Embedding Model"
)

embed_dim_choice = gr.Dropdown(
    choices=[768, 512, 256, 128, 64],
    value=768,  # default to the models' full embedding dimension
    label="Embedding Dimension"
)

output_text = gr.Textbox(label="Output")

gr.Interface(
    fn=process_query,
    inputs=[wiki_page_input, model_choice, embed_dim_choice, query_input, api_key_input],
    outputs=output_text,
    title="Arabic Wiki RAG",
    description="Choose a Wikipedia page, embedding model, and dimension to answer a query in Arabic."
).launch()
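
# A minimal smoke test, assuming an OpenAI key in the OPENAI_API_KEY environment
# variable; the page title and query below are illustrative placeholders, not values
# from the original app. Uncomment to exercise the pipeline without the UI:
#
# import os
# answer = process_query(
#     wiki_page="الذكاء الاصطناعي",                 # example: the Arabic AI article
#     model_name="Arabic-all-nli-triplet-Matryoshka",
#     embed_dim=256,
#     query="ما هو الذكاء الاصطناعي؟",               # "What is artificial intelligence?"
#     api_key=os.environ["OPENAI_API_KEY"],
# )
# print(answer)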