Spaces:

Ocillus
/

Arcana

Running

App Files Files Community

Ocillus committed on Jul 18, 2024

Commit

b790fc2

verified ·

1 Parent(s): 639dede

Upload 2 files

Browse files

Files changed (2) hide show

Arcana.py +12 -20
ArcanaUI2.py +219 -0

Arcana.py CHANGED Viewed

@@ -10,13 +10,18 @@ from nylon import ChatDatabase, get_keywords
 def extract_text_from_pdf(pdf_path):
     output_string = io.StringIO()
     with open(pdf_path, 'rb') as fin:
-        extract_text_to_fp(fin, output_string, laparams=LAParams(),
                            output_type='text', codec='utf-8')
     return output_string.getvalue()
 def process_text_into_paragraphs(text):
     # Split text into paragraphs
-    paragraphs = re.split(r'\n\s*\n', text)
     # Clean up each paragraph
     cleaned_paragraphs = []
@@ -24,6 +29,7 @@ def process_text_into_paragraphs(text):
         # Remove extra whitespace and join broken words
         cleaned_para = re.sub(r'\s+', ' ', para).strip()
         cleaned_para = re.sub(r'(\w+)-\s*(\w+)', r'\1\2', cleaned_para)
         if cleaned_para:  # Only add non-empty paragraphs
             cleaned_paragraphs.append(cleaned_para)
@@ -45,13 +51,14 @@ def process_pdfs(directory, db):
             paragraphs = process_text_into_paragraphs(text)
             for paragraph in paragraphs:
-                db.add_message(sender, fixed_timestamp, paragraph, tag)
             pbar.update(1)
             pbar.set_postfix({"Current File": filename})
 def main():
-    db_filename = 'textbooks.txt'
     if os.path.exists(db_filename):
         print(f"Database file '{db_filename}' already exists. Loading existing database...")
@@ -59,7 +66,7 @@ def main():
     else:
         print(f"Creating new database '{db_filename}'...")
         db = ChatDatabase(db_filename)
-        pdf_directory = 'pdfdemos'
         start_time = time.time()
         process_pdfs(pdf_directory, db)
@@ -68,20 +75,5 @@ def main():
         total_time = end_time - start_time
         print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
-    # Example query
-    query = "NaCl"
-    sender = "Arcana"  # Now all senders are "Arcana"
-    N = 5
-    cache = {}
-    query_tag = "Chemistry2e-WEB"  # Use the PDF name as the tag for querying
-    relevant_messages = db.get_relevant_messages(sender, query, N, cache, query_tag)
-    print(f"\nTop {N} relevant paragraphs for query '{query}' with tag '{query_tag}':")
-    for message in relevant_messages:
-        print(f"From {message[0]} at {message[1]}:")
-        print(f"Tag: {message[3]}")
-        print(message[2][:200] + "...\n")
 if __name__ == "__main__":
     main()

 def extract_text_from_pdf(pdf_path):
     output_string = io.StringIO()
     with open(pdf_path, 'rb') as fin:
+        laparams = LAParams(line_margin=1.5, char_margin=2.0, word_margin=0.1)
+        extract_text_to_fp(fin, output_string, laparams=laparams,
                            output_type='text', codec='utf-8')
     return output_string.getvalue()
 def process_text_into_paragraphs(text):
+    # Remove page numbers and headers/footers
+    text = re.sub(r'\n\d+\n', '\n', text)
+    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
     # Split text into paragraphs
+    paragraphs = re.split(r'\n{2,}', text)
     # Clean up each paragraph
     cleaned_paragraphs = []
         # Remove extra whitespace and join broken words
         cleaned_para = re.sub(r'\s+', ' ', para).strip()
         cleaned_para = re.sub(r'(\w+)-\s*(\w+)', r'\1\2', cleaned_para)
+        cleaned_para = re.sub(r'(\w+)\s*\n\s*(\w+)', r'\1 \2', cleaned_para)
         if cleaned_para:  # Only add non-empty paragraphs
             cleaned_paragraphs.append(cleaned_para)
             paragraphs = process_text_into_paragraphs(text)
             for paragraph in paragraphs:
+                print(paragraph)
+                db.add_message(sender, fixed_timestamp, str(paragraph), tag)
             pbar.update(1)
             pbar.set_postfix({"Current File": filename})
 def main():
+    db_filename = 'memory.txt'
     if os.path.exists(db_filename):
         print(f"Database file '{db_filename}' already exists. Loading existing database...")
     else:
         print(f"Creating new database '{db_filename}'...")
         db = ChatDatabase(db_filename)
+        pdf_directory = 'cache'
         start_time = time.time()
         process_pdfs(pdf_directory, db)
         total_time = end_time - start_time
         print(f"\nDatabase creation complete. Total time: {total_time:.2f} seconds")
 if __name__ == "__main__":
     main()

ArcanaUI2.py ADDED Viewed

	@@ -0,0 +1,219 @@

+import gradio as gr
+import ssl
+from openai import OpenAI
+import time
+import os
+import shutil
+from datetime import datetime
+import Arcana
+# SSL configuration to avoid verification issues
+# NOTE(review): this replaces ssl's default HTTPS context factory with the
+# unverified one for the whole process, so code relying on that default
+# (presumably urllib / http.client) stops validating certificates. Confirm
+# that this is really required before keeping it.
+try:
+    # Private attribute lookup; the except branch covers it being absent.
+    _create_unverified_https_context = ssl._create_unverified_context
+except AttributeError:
+    # Nothing to patch: the default (verifying) context factory stays in place.
+    pass
+else:
+    ssl._create_default_https_context = _create_unverified_https_context
+# OpenAI client setup
+# The API key is taken from the OPENAI_API_KEY environment variable so that no
+# secret lives in the repository. Any key that was ever committed in this file
+# must be treated as leaked and revoked.
+# OPENAI_BASE_URL may override the proxy endpoint; the default is unchanged.
+client = OpenAI(
+    base_url=os.environ.get('OPENAI_BASE_URL', 'https://api.openai-proxy.org/v1'),
+    api_key=os.environ.get('OPENAI_API_KEY'),
+)
+# Retry logic for OpenAI API call
+def openai_api_call(messages, retries=3, delay=5):
+    """Request a chat completion from the module-level client, retrying on failure.
+
+    Args:
+        messages: List of role/content dicts forwarded unchanged to the API.
+        retries: Maximum number of attempts.
+        delay: Seconds to wait between two consecutive attempts.
+
+    Returns:
+        The content of the first completion choice, or a fixed apology string
+        once every attempt has failed. Exceptions are never propagated.
+    """
+    for attempt in range(retries):
+        try:
+            completion = client.chat.completions.create(
+                model="gpt-4o",
+                messages=messages,
+                timeout=10
+            )
+            return completion.choices[0].message.content
+        except Exception as e:
+            # Deliberately broad: every failure is logged and then retried.
+            print(f"Attempt {attempt + 1} failed: {e}")
+            # Only pause when another attempt follows; sleeping after the last
+            # failure would merely delay the fallback message.
+            if attempt + 1 < retries:
+                time.sleep(delay)
+    return "Sorry, I am having trouble connecting to the server. Please try again later."
+# Chatbot response function
+def chatbot_response(message, history):
+    """Build the full conversation for the model and return its reply.
+
+    Args:
+        message: The user's newest message.
+        history: Iterable of (user_text, assistant_text) pairs from earlier turns.
+
+    Returns:
+        Whatever openai_api_call() returns for the assembled message list.
+    """
+    system_prompt = '''You are Arcana, a dynamic study resource database designed to help students excel in their exams. Your responses should be accurate, informative, and evidence-based whenever possible. Follow these guidelines:
+Your primary goal is to provide students with the most helpful and accurate study information, utilizing both your internal knowledge and the PDF resources at your disposal.'''
+    conversation = [{"role": "system", "content": system_prompt}]
+    # Replay earlier turns in order: each pair becomes a user then an assistant entry.
+    for user_turn, bot_turn in history:
+        conversation.extend((
+            {"role": "user", "content": user_turn},
+            {"role": "assistant", "content": bot_turn},
+        ))
+    conversation.append({"role": "user", "content": message})
+    return openai_api_call(conversation)
+# Name of the file currently selected in the uploads table; shared through
+# `global` by on_select, delete_file and rename_file.
+selected = None
+def upload_file(file):
+    """Copy an uploaded file into the 'cache' folder and return the refreshed listing.
+
+    Args:
+        file: The upload handed over by the UI: either an object exposing the
+            temporary path as ``.name`` or the path itself as a string.
+            ``None`` is tolerated and copies nothing.
+
+    Returns:
+        The rows produced by list_uploaded_files().
+    """
+    foldername = 'cache'
+    # exist_ok avoids the check-then-create race of exists() followed by mkdir().
+    os.makedirs(foldername, exist_ok=True)
+    if file is None:
+        return list_uploaded_files()
+    # Works for objects carrying .name as well as for plain path strings.
+    source_path = getattr(file, 'name', file)
+    file_path = os.path.join(foldername, os.path.basename(source_path))
+    shutil.copy(source_path, file_path)
+    return list_uploaded_files()
+def list_uploaded_files():
+    """Return the entries of the 'cache' folder as single-column table rows.
+
+    Returns:
+        A list with one ``[name]`` row per directory entry, in os.listdir()
+        order, or an empty list when the folder does not exist.
+    """
+    upload_dir = 'cache'
+    if os.path.exists(upload_dir):
+        return [[entry] for entry in os.listdir(upload_dir)]
+    return []
+def on_select(evt: gr.SelectData):
+    """Record the clicked table cell as the current selection and describe the file.
+
+    Args:
+        evt: Selection event; its ``value`` and ``index`` attributes are read.
+
+    Returns:
+        A 4-tuple (path under 'cache' or None, status message, size text,
+        creation-time text). The last two are empty strings when nothing is selected.
+    """
+    global selected
+    value, index = evt.value, evt.index
+    selected = value
+    print(f"Selected value: {value} at index: {index}")
+    if not value:
+        return None, "No file selected", "", ""
+    path = os.path.join("cache", value)
+    return path, f"Selected: {value}", get_file_size(path), get_file_creation_time(path)
+def get_file_size(file_path):
+    """Return a human-readable size for *file_path*.
+
+    Args:
+        file_path: Path to inspect; may be None or empty.
+
+    Returns:
+        "<n> bytes" below 1024 bytes, "<x.xx> KB" below 1024*1024 bytes,
+        "<x.xx> MB" otherwise, or "" when the path is falsy or does not exist.
+    """
+    if not file_path or not os.path.exists(file_path):
+        return ""
+    byte_count = os.path.getsize(file_path)
+    kib = 1024
+    mib = kib * kib
+    # Largest unit first, so each guard only needs a lower bound.
+    if byte_count >= mib:
+        return f"{byte_count / mib:.2f} MB"
+    if byte_count >= kib:
+        return f"{byte_count / kib:.2f} KB"
+    return f"{byte_count} bytes"
+def get_file_creation_time(file_path):
+    """Return the ctime of *file_path* formatted as 'YYYY-MM-DD HH:MM:SS'.
+
+    Args:
+        file_path: Path to inspect; may be None or empty.
+
+    Returns:
+        The formatted local timestamp, or "" when the path is falsy or missing.
+
+    NOTE(review): os.path.getctime() is documented as the last metadata change
+    on Unix and the creation time on Windows, so "creation time" is
+    platform-dependent here.
+    """
+    if not (file_path and os.path.exists(file_path)):
+        return ""
+    stamp = datetime.fromtimestamp(os.path.getctime(file_path))
+    return stamp.strftime("%Y-%m-%d %H:%M:%S")
+def delete_file():
+    """Delete the currently selected file from the 'cache' folder.
+
+    Returns:
+        A 5-tuple (refreshed file rows, None for the file viewer, status
+        message, "", "") matching the outputs wired to the delete button.
+    """
+    global selected
+    if not selected:
+        return list_uploaded_files(), None, "No file selected for deletion", "", ""
+    target = selected
+    # The selection is consumed whatever the outcome, so a second click cannot
+    # act on the name of a file that is already gone.
+    selected = None
+    try:
+        # EAFP: removing directly avoids the exists()/remove() race.
+        os.remove(os.path.join('cache', target))
+    except FileNotFoundError:
+        return list_uploaded_files(), None, f"File {target} not found", "", ""
+    return list_uploaded_files(), None, f"File {target} deleted successfully", "", ""
+def refresh_files():
+    """Return the current rows from list_uploaded_files() (used as the Refresh button handler)."""
+    return list_uploaded_files()
+def display_file(evt: gr.SelectData, df):
+    """Resolve the clicked table cell to a path under 'cache' for the viewers.
+
+    Args:
+        evt: Selection event; only its ``value`` attribute is read.
+        df: Current table contents supplied by the event wiring (unused here).
+
+    Returns:
+        A 3-tuple (file path, the same path if it ends in .png/.jpg/.jpeg/.gif
+        ignoring case and None otherwise, status message).
+    """
+    path = os.path.join("cache", evt.value)
+    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif')
+    preview = path if path.lower().endswith(image_suffixes) else None
+    return path, preview, f"Displaying: {evt.value}"
+def render_to_database():
+    """Run Arcana.main(); wired to the "Render all PDFs to Database" button."""
+    # Delegates entirely to the Arcana module; nothing is returned to the UI.
+    Arcana.main()
+def rename_file(new_name):
+    """Rename the currently selected file inside 'cache', keeping its extension.
+
+    Args:
+        new_name: New base name without extension. Any directory components
+            are discarded so the file cannot leave the 'cache' folder.
+
+    Returns:
+        A 5-tuple (refreshed file rows, status message, new path or None,
+        size text, creation-time text). The last three are None, "", ""
+        whenever nothing was renamed.
+    """
+    global selected
+    safe_name = os.path.basename(new_name) if new_name else ""
+    if not (selected and safe_name):
+        return list_uploaded_files(), "No file selected or new name not provided", None, "", ""
+    old_path = os.path.join('cache', selected)
+    if not os.path.exists(old_path):
+        return list_uploaded_files(), f"File {selected} not found", None, "", ""
+    # splitext keeps the suffix (dot included) and yields '' for names that have
+    # none, unlike split('.')[-1], which would reuse the whole old name.
+    extension = os.path.splitext(selected)[1]
+    new_filename = safe_name + extension
+    new_path = os.path.join('cache', new_filename)
+    if os.path.exists(new_path):
+        # os.rename may silently replace an existing file; refuse instead.
+        return list_uploaded_files(), f"File {new_filename} already exists", None, "", ""
+    os.rename(old_path, new_path)
+    # Track the full on-disk name (extension included) so that later delete and
+    # rename calls still find the file.
+    selected = new_filename
+    return list_uploaded_files(), f"File renamed to {new_name}", new_path, get_file_size(new_path), get_file_creation_time(new_path)
+# Create the Gradio interface for the chatbot
+# chatbot_response is the callback; it is written to take the new message and
+# the history of (user, assistant) pairs.
+# NOTE(review): retry_btn / undo_btn / clear_btn are not accepted by every
+# Gradio release (believed removed in 5.x) — confirm against the pinned version.
+chatbot_interface = gr.ChatInterface(
+    chatbot_response,
+    chatbot=gr.Chatbot(height=400),
+    textbox=gr.Textbox(placeholder="Type your message here...", container=True, scale=100),
+    title="Review With Arcana",
+    description="ArcanaUI v0.7 - Chatbot",
+    theme="soft",
+    examples=[
+        "What is Hydrogen Bonding?",
+        "Tell me the difference between impulse and force.",
+        "Tell me a joke that Calculus students will know.",
+        "How should I review for the AP Biology Exam?",
+        "What kind of resources are available in PA and Indexademics?",
+        "What is the StandardCAS™ group?"
+    ],
+    cache_examples=False,
+    retry_btn=None,
+    undo_btn="Delete Previous",
+    clear_btn="Clear"
+)
+# Combine the interfaces using Tabs
+# Layout: a welcome tab, the chatbot tab (re-rendering chatbot_interface), and
+# an upload tab with a file table on the left and file/image viewers on the right.
+with gr.Blocks() as demo:
+    gr.Markdown("# ArcanaUI v0.7")
+    with gr.Tabs():
+        with gr.TabItem("Welcome Page"):
+            gr.Markdown("""
+            hi
+            """)
+        with gr.TabItem("Chatbot"):
+            chatbot_interface.render()
+        # File uploading interface
+        with gr.TabItem('Upload'):
+            gr.Markdown('# Upload and View Files')
+            with gr.Row():
+                # Left column: File list and buttons
+                with gr.Column(scale=1):
+                    uploaded_files_list = gr.DataFrame(headers=["Uploaded Files"], datatype="str", interactive=False)
+                    with gr.Row():
+                        upload_button = gr.UploadButton('Upload File')
+                        refresh_button = gr.Button('Refresh')
+                        delete_button = gr.Button('Delete Selected File')
+                # Right column: File viewer and Image viewer
+                with gr.Column(scale=1):
+                    with gr.Tab("File  Viewer"):
+                        file_viewer = gr.File(label="File Restore")
+                        file_status = gr.Textbox(label="File Status", interactive=False)
+                        file_size = gr.Textbox(label="File Size", interactive=False)
+                        file_creation_time = gr.Textbox(label="File Creation Time", interactive=False)
+                        with gr.Row():
+                            new_file_name = gr.Textbox(label="New File Name", placeholder="Enter new file name")
+                            rename_button = gr.Button("Rename File")
+                    with gr.Tab("Image Viewer"):
+                        image_viewer = gr.Image(label="Image Viewer", type="filepath")
+            # Event handlers
+            refresh_button.click(fn=refresh_files, outputs=uploaded_files_list)
+            upload_button.upload(upload_file, inputs=upload_button, outputs=uploaded_files_list)
+            delete_button.click(fn=delete_file, outputs=[uploaded_files_list, file_viewer, file_status, file_size, file_creation_time])
+            # NOTE(review): two select listeners are registered on the same table
+            # and both write file_viewer and file_status; which status message
+            # ends up displayed depends on completion order — consider merging.
+            uploaded_files_list.select(fn=display_file, inputs=uploaded_files_list, outputs=[file_viewer, image_viewer, file_status])
+            uploaded_files_list.select(fn=on_select, outputs=[file_viewer, file_status, file_size, file_creation_time])
+            rename_button.click(fn=rename_file,
+                                inputs=new_file_name,
+                                outputs=[uploaded_files_list, file_status, file_viewer, file_size, file_creation_time])
+            # No outputs are wired, so the UI gets no feedback from this handler.
+            render_button = gr.Button("Render all PDFs to Database")
+            render_button.click(fn=render_to_database)
+# Launch the interface
+# NOTE(review): runs at import time (no __main__ guard) and requests a public
+# share link — confirm both are intended for the deployment target.
+demo.launch(share=True)