Ocillus committed on
Commit
b790fc2
·
verified ·
1 Parent(s): 639dede

Upload 2 files

Browse files
Files changed (2) hide show
  1. Arcana.py +12 -20
  2. ArcanaUI2.py +219 -0
Arcana.py CHANGED
@@ -10,13 +10,18 @@ from nylon import ChatDatabase, get_keywords
10
  def extract_text_from_pdf(pdf_path):
11
  output_string = io.StringIO()
12
  with open(pdf_path, 'rb') as fin:
13
- extract_text_to_fp(fin, output_string, laparams=LAParams(),
 
14
  output_type='text', codec='utf-8')
15
  return output_string.getvalue()
16
 
17
  def process_text_into_paragraphs(text):
 
 
 
 
18
  # Split text into paragraphs
19
- paragraphs = re.split(r'\n\s*\n', text)
20
 
21
  # Clean up each paragraph
22
  cleaned_paragraphs = []
@@ -24,6 +29,7 @@ def process_text_into_paragraphs(text):
24
  # Remove extra whitespace and join broken words
25
  cleaned_para = re.sub(r'\s+', ' ', para).strip()
26
  cleaned_para = re.sub(r'(\w+)-\s*(\w+)', r'\1\2', cleaned_para)
 
27
  if cleaned_para: # Only add non-empty paragraphs
28
  cleaned_paragraphs.append(cleaned_para)
29
 
@@ -45,13 +51,14 @@ def process_pdfs(directory, db):
45
  paragraphs = process_text_into_paragraphs(text)
46
 
47
  for paragraph in paragraphs:
48
- db.add_message(sender, fixed_timestamp, paragraph, tag)
 
49
 
50
  pbar.update(1)
51
  pbar.set_postfix({"Current File": filename})
52
 
53
  def main():
54
- db_filename = 'textbooks.txt'
55
 
56
  if os.path.exists(db_filename):
57
  print(f"Database file '{db_filename}' already exists. Loading existing database...")
@@ -59,7 +66,7 @@ def main():
59
  else:
60
  print(f"Creating new database '{db_filename}'...")
61
  db = ChatDatabase(db_filename)
62
- pdf_directory = 'pdfdemos'
63
 
64
  start_time = time.time()
65
  process_pdfs(pdf_directory, db)
@@ -68,20 +75,5 @@ def main():
68
  total_time = end_time - start_time
69
  print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
70
 
71
- # Example query
72
- query = "NaCl"
73
- sender = "Arcana" # Now all senders are "Arcana"
74
- N = 5
75
- cache = {}
76
- query_tag = "Chemistry2e-WEB" # Use the PDF name as the tag for querying
77
-
78
- relevant_messages = db.get_relevant_messages(sender, query, N, cache, query_tag)
79
-
80
- print(f"\nTop {N} relevant paragraphs for query '{query}' with tag '{query_tag}':")
81
- for message in relevant_messages:
82
- print(f"From {message[0]} at {message[1]}:")
83
- print(f"Tag: {message[3]}")
84
- print(message[2][:200] + "...\n")
85
-
86
  if __name__ == "__main__":
87
  main()
 
10
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at *pdf_path* as a single string.

    The file is opened in binary mode and handed to ``extract_text_to_fp``,
    which writes plain text into an in-memory buffer. Layout analysis uses
    ``LAParams(line_margin=1.5, char_margin=2.0, word_margin=0.1)``.
    """
    layout = LAParams(line_margin=1.5, char_margin=2.0, word_margin=0.1)
    buffer = io.StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        extract_text_to_fp(
            pdf_file,
            buffer,
            laparams=layout,
            output_type='text',
            codec='utf-8',
        )
    return buffer.getvalue()
17
 
18
  def process_text_into_paragraphs(text):
19
+ # Remove page numbers and headers/footers
20
+ text = re.sub(r'\n\d+\n', '\n', text)
21
+ text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
22
+
23
  # Split text into paragraphs
24
+ paragraphs = re.split(r'\n{2,}', text)
25
 
26
  # Clean up each paragraph
27
  cleaned_paragraphs = []
 
29
  # Remove extra whitespace and join broken words
30
  cleaned_para = re.sub(r'\s+', ' ', para).strip()
31
  cleaned_para = re.sub(r'(\w+)-\s*(\w+)', r'\1\2', cleaned_para)
32
+ cleaned_para = re.sub(r'(\w+)\s*\n\s*(\w+)', r'\1 \2', cleaned_para)
33
  if cleaned_para: # Only add non-empty paragraphs
34
  cleaned_paragraphs.append(cleaned_para)
35
 
 
51
  paragraphs = process_text_into_paragraphs(text)
52
 
53
  for paragraph in paragraphs:
54
+ print(paragraph)
55
+ db.add_message(sender, fixed_timestamp, str(paragraph), tag)
56
 
57
  pbar.update(1)
58
  pbar.set_postfix({"Current File": filename})
59
 
60
  def main():
61
+ db_filename = 'memory.txt'
62
 
63
  if os.path.exists(db_filename):
64
  print(f"Database file '{db_filename}' already exists. Loading existing database...")
 
66
  else:
67
  print(f"Creating new database '{db_filename}'...")
68
  db = ChatDatabase(db_filename)
69
+ pdf_directory = 'cache'
70
 
71
  start_time = time.time()
72
  process_pdfs(pdf_directory, db)
 
75
  total_time = end_time - start_time
76
  print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  if __name__ == "__main__":
79
  main()
ArcanaUI2.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import ssl
3
+ from openai import OpenAI
4
+ import time
5
+ import os
6
+ import shutil
7
+ from datetime import datetime
8
+ import Arcana
9
+
10
# SSL configuration.
# Certificate verification stays ON by default. Replacing the stdlib default
# HTTPS context with an unverified one disables verification for every
# stdlib HTTPS connection in the process, so it is only done when the
# operator explicitly opts in by setting ARCANA_ALLOW_INSECURE_SSL=1.
# NOTE(review): this patches the stdlib `ssl` default only; confirm whether
# the HTTP stack used by the OpenAI client is affected by it at all.
if os.environ.get("ARCANA_ALLOW_INSECURE_SSL") == "1":
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # This Python build does not expose the private helper; keep defaults.
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context
        print("WARNING: HTTPS certificate verification is disabled "
              "(ARCANA_ALLOW_INSECURE_SSL=1).")
17
+
18
+
19
# OpenAI client setup.
# The API key is taken from the OPENAI_API_KEY environment variable so that no
# credential lives in the repository. The key that was previously committed
# here must be treated as leaked and revoked.
# The proxy endpoint remains the default; OPENAI_BASE_URL may override it.
client = OpenAI(
    base_url=os.environ.get('OPENAI_BASE_URL', 'https://api.openai-proxy.org/v1'),
    api_key=os.environ.get('OPENAI_API_KEY'),
)
24
+
25
+ # Retry logic for OpenAI API call
26
def openai_api_call(messages, retries=3, delay=5):
    """Send *messages* to the chat-completions endpoint, retrying on failure.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts passed
            unchanged as the ``messages`` argument of the API call.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The content of the first choice of the completion, or a fixed apology
        string when every attempt failed (or ``retries`` is 0).
    """
    for attempt in range(retries):
        try:
            completion = client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                timeout=10
            )
            return completion.choices[0].message.content
        except Exception as e:
            # Deliberately broad: this is the UI boundary, and any failure is
            # turned into the fallback message below instead of a traceback.
            print(f"Attempt {attempt + 1} failed: {e}")
            # Wait only when another attempt will follow; sleeping after the
            # last failure just delays the fallback reply.
            if attempt + 1 < retries:
                time.sleep(delay)
    return "Sorry, I am having trouble connecting to the server. Please try again later."
39
+
40
+ # Chatbot response function
41
def chatbot_response(message, history):
    """Build the conversation for the model and return its reply.

    Args:
        message: The text the user has just submitted.
        history: Iterable of ``(user_text, assistant_text)`` pairs from the
            earlier turns of the conversation.

    Returns:
        Whatever ``openai_api_call`` returns for the assembled messages.
    """
    system_prompt = '''You are Arcana, a dynamic study resource database designed to help students excel in their exams. Your responses should be accurate, informative, and evidence-based whenever possible. Follow these guidelines:
Your primary goal is to provide students with the most helpful and accurate study information, utilizing both your internal knowledge and the PDF resources at your disposal.'''

    conversation = [{"role": "system", "content": system_prompt}]
    # Replay earlier turns in order, user first, then the assistant's answer.
    for user_turn, assistant_turn in history:
        conversation.extend((
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ))
    conversation.append({"role": "user", "content": message})

    return openai_api_call(conversation)
53
+
54
# Name of the file currently selected in the upload table. It is assigned by
# on_select() and rename_file() and read by delete_file() and rename_file();
# it stays None until a selection event has been handled.
selected = None
55
+
56
def upload_file(file):
    """Copy an uploaded file into the ``cache`` folder.

    Args:
        file: Upload object whose ``name`` attribute is the path of the
            uploaded temporary file.

    Returns:
        The refreshed file listing from ``list_uploaded_files``.
    """
    foldername = 'cache'
    # exist_ok removes the check-then-create race of exists() + mkdir(), which
    # raised FileExistsError when two uploads arrived at the same time.
    os.makedirs(foldername, exist_ok=True)
    file_path = os.path.join(foldername, os.path.basename(file.name))
    shutil.copy(file.name, file_path)
    return list_uploaded_files()
63
+
64
def list_uploaded_files():
    """List the entries of the ``cache`` folder as single-column table rows.

    Returns:
        A list of one-element lists (``[[name], ...]``), sorted by name so
        the table order is stable between refreshes; an empty list when the
        folder does not exist.
    """
    foldername = 'cache'
    try:
        files = os.listdir(foldername)
    except FileNotFoundError:
        # Nothing has been uploaded yet (or the folder was removed meanwhile).
        return []
    # os.listdir returns entries in arbitrary order; sort for a stable display.
    return [[file] for file in sorted(files)]
70
+
71
def on_select(evt: gr.SelectData):
    """Record the selected table cell and return its file details.

    Stores ``evt.value`` in the module-level ``selected`` and logs it together
    with ``evt.index``.

    Returns:
        ``(file_path, status_message, file_size, file_creation_time)``. When
        the selected value is empty the path is ``None``, the status is
        "No file selected" and both detail strings are empty.
    """
    global selected
    selected = evt.value
    print(f"Selected value: {selected} at index: {evt.index}")

    if not selected:
        return None, "No file selected", "", ""

    file_path = os.path.join("cache", selected)
    return (
        file_path,
        f"Selected: {selected}",
        get_file_size(file_path),
        get_file_creation_time(file_path),
    )
85
+
86
def get_file_size(file_path):
    """Return a human-readable size for *file_path*.

    Args:
        file_path: Path of the file, or a falsy value.

    Returns:
        ``"<n> bytes"`` below 1 KiB, ``"<x.xx> KB"`` below 1 MiB, otherwise
        ``"<x.xx> MB"``; an empty string when the path is falsy or the file
        cannot be stat'ed.
    """
    if not file_path:
        return ""
    try:
        # Ask for the size directly instead of exists() + getsize(): the file
        # can be deleted or renamed between the two calls.
        size_bytes = os.path.getsize(file_path)
    except (OSError, ValueError):
        return ""

    if size_bytes < 1024:
        return f"{size_bytes} bytes"
    if size_bytes < 1024 * 1024:
        return f"{size_bytes / 1024:.2f} KB"
    return f"{size_bytes / (1024 * 1024):.2f} MB"
96
+
97
def get_file_creation_time(file_path):
    """Return the creation time of *file_path* as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        file_path: Path of the file, or a falsy value.

    Returns:
        The formatted local timestamp, or an empty string when the path is
        falsy or the file cannot be stat'ed.
    """
    if not file_path:
        return ""
    try:
        # One stat() call instead of exists() + getctime(): the file can
        # disappear between the two.
        stat_result = os.stat(file_path)
    except (OSError, ValueError):
        return ""

    # st_ctime is the last metadata change on Unix (it moves on rename), not
    # the creation time. Prefer the true birth time where the platform
    # exposes it and fall back to st_ctime elsewhere.
    creation_time = getattr(stat_result, "st_birthtime", stat_result.st_ctime)
    return datetime.fromtimestamp(creation_time).strftime("%Y-%m-%d %H:%M:%S")
102
+
103
def delete_file():
    """Delete the currently selected file from the ``cache`` folder.

    Returns:
        ``(file_listing, None, status_message, "", "")``: the refreshed
        listing, a cleared file viewer, a status text, and empty size and
        creation-time fields.
    """
    global selected
    if not selected:
        return list_uploaded_files(), None, "No file selected for deletion", "", ""

    # Keep only the final path component so a crafted selection value cannot
    # point outside the cache folder.
    name = os.path.basename(selected)
    file_path = os.path.join('cache', name)
    try:
        os.remove(file_path)
    except FileNotFoundError:
        message = f"File {selected} not found"
    else:
        message = f"File {selected} deleted successfully"

    # The selection no longer refers to an existing file; clearing it stops a
    # later delete or rename from acting on a stale name.
    selected = None
    return list_uploaded_files(), None, message, "", ""
115
+
116
def refresh_files():
    """Return the current cache listing, as produced by ``list_uploaded_files``."""
    current_listing = list_uploaded_files()
    return current_listing
118
+
119
def display_file(evt: gr.SelectData, df):
    """Return viewer values for the selected table cell.

    Args:
        evt: Selection event; ``evt.value`` is used as the file name.
        df: Current table value (not used by this function).

    Returns:
        ``(file_path, image_path, status)`` where ``image_path`` equals
        ``file_path`` for .png/.jpg/.jpeg/.gif names and is ``None`` otherwise.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif')
    file_path = os.path.join("cache", evt.value)
    if file_path.lower().endswith(image_suffixes):
        image_path = file_path
    else:
        image_path = None
    return file_path, image_path, f"Displaying: {evt.value}"
122
+
123
def render_to_database():
    """Run ``Arcana.main()``; used as the click handler of the render button.

    Returns nothing, so the click updates no UI component.
    """
    Arcana.main()
126
+
127
+
128
def rename_file(new_name):
    """Rename the selected file inside ``cache``, keeping its extension.

    Args:
        new_name: New base name (without extension) typed by the user.

    Returns:
        ``(file_listing, status_message, new_path_or_None, size, creation_time)``.
        On any failure the path is ``None`` and both detail strings are empty.
    """
    global selected
    # Keep only the final path component so the new name cannot place the
    # file outside the cache folder.
    stem = os.path.basename(new_name.strip()) if new_name else ""
    if not (selected and stem):
        return list_uploaded_files(), "No file selected or new name not provided", None, "", ""

    old_name = os.path.basename(selected)
    old_path = os.path.join('cache', old_name)
    if not os.path.exists(old_path):
        return list_uploaded_files(), f"File {selected} not found", None, "", ""

    # splitext yields '' for names without a dot; split('.')[-1] returned the
    # whole file name in that case and appended it as a bogus extension.
    extension = os.path.splitext(old_name)[1]
    new_basename = stem + extension
    new_path = os.path.join('cache', new_basename)

    # os.rename silently replaces an existing target on POSIX; refuse instead
    # of destroying another uploaded file.
    if new_basename != old_name and os.path.exists(new_path):
        return list_uploaded_files(), f"File {new_basename} already exists", None, "", ""

    os.rename(old_path, new_path)
    # Track the full new file name (with extension); storing only the stem
    # made every later delete or rename report "not found".
    selected = new_basename
    return (
        list_uploaded_files(),
        f"File renamed to {stem}",
        new_path,
        get_file_size(new_path),
        get_file_creation_time(new_path),
    )
140
+
141
# Chat tab: a ChatInterface that routes every message through chatbot_response.
# The example prompts are offered as one-click suggestions and are not cached.
_arcana_example_prompts = [
    "What is Hydrogen Bonding?",
    "Tell me the difference between impulse and force.",
    "Tell me a joke that Calculus students will know.",
    "How should I review for the AP Biology Exam?",
    "What kind of resources are available in PA and Indexademics?",
    "What is the StandardCAS™ group?",
]

chatbot_interface = gr.ChatInterface(
    fn=chatbot_response,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(
        placeholder="Type your message here...",
        container=True,
        scale=100,
    ),
    title="Review With Arcana",
    description="ArcanaUI v0.7 - Chatbot",
    theme="soft",
    examples=_arcana_example_prompts,
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)
162
+
163
+ # Combine the interfaces using Tabs
164
+ with gr.Blocks() as demo:
165
+ gr.Markdown("# ArcanaUI v0.7")
166
+ with gr.Tabs():
167
+ with gr.TabItem("Welcome Page"):
168
+ gr.Markdown("""
169
+ hi
170
+ """)
171
+
172
+ with gr.TabItem("Chatbot"):
173
+ chatbot_interface.render()
174
+
175
+ # File uploading interface
176
+ with gr.TabItem('Upload'):
177
+ gr.Markdown('# Upload and View Files')
178
+
179
+ with gr.Row():
180
+ # Left column: File list and buttons
181
+ with gr.Column(scale=1):
182
+ uploaded_files_list = gr.DataFrame(headers=["Uploaded Files"], datatype="str", interactive=False)
183
+
184
+ with gr.Row():
185
+ upload_button = gr.UploadButton('Upload File')
186
+ refresh_button = gr.Button('Refresh')
187
+ delete_button = gr.Button('Delete Selected File')
188
+
189
+ # Right column: File viewer and Image viewer
190
+ with gr.Column(scale=1):
191
+ with gr.Tab("File Viewer"):
192
+ file_viewer = gr.File(label="File Restore")
193
+ file_status = gr.Textbox(label="File Status", interactive=False)
194
+ file_size = gr.Textbox(label="File Size", interactive=False)
195
+ file_creation_time = gr.Textbox(label="File Creation Time", interactive=False)
196
+
197
+ with gr.Row():
198
+ new_file_name = gr.Textbox(label="New File Name", placeholder="Enter new file name")
199
+ rename_button = gr.Button("Rename File")
200
+
201
+
202
+ with gr.Tab("Image Viewer"):
203
+ image_viewer = gr.Image(label="Image Viewer", type="filepath")
204
+
205
+ # Event handlers
206
+ refresh_button.click(fn=refresh_files, outputs=uploaded_files_list)
207
+ upload_button.upload(upload_file, inputs=upload_button, outputs=uploaded_files_list)
208
+ delete_button.click(fn=delete_file, outputs=[uploaded_files_list, file_viewer, file_status, file_size, file_creation_time])
209
+ uploaded_files_list.select(fn=display_file, inputs=uploaded_files_list, outputs=[file_viewer, image_viewer, file_status])
210
+ uploaded_files_list.select(fn=on_select, outputs=[file_viewer, file_status, file_size, file_creation_time])
211
+ rename_button.click(fn=rename_file,
212
+ inputs=new_file_name,
213
+ outputs=[uploaded_files_list, file_status, file_viewer, file_size, file_creation_time])
214
+
215
+ render_button = gr.Button("Render all PDFs to Database")
216
+ render_button.click(fn=render_to_database)
217
+
218
# Launch the interface.
# Guarded so that importing this module (from a test or another script) does
# not start a web server as a side effect; running the file directly behaves
# as before.
# NOTE(review): share=True requests a public link; confirm that exposing the
# upload/delete/rename handlers publicly is intended.
if __name__ == "__main__":
    demo.launch(share=True)