"""Ingest .pdf/.docx/.pptx files from a folder into a nylon ChatDatabase.

Each document is converted to plain text, split into cleaned paragraphs,
and every paragraph is stored as a message tagged with the source
filename (without extension).
"""

import io
import os
import re
import sys
import time

from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from tqdm import tqdm
from nylon import ChatDatabase, get_keywords
from docx import Document
from pptx import Presentation


def extract_text_from_pdf(pdf_path):
    """Return the full text of the PDF at *pdf_path* via pdfminer."""
    output_string = io.StringIO()
    with open(pdf_path, 'rb') as fin:
        # Looser layout margins help pdfminer keep paragraph lines together.
        laparams = LAParams(line_margin=1.5, char_margin=2.0, word_margin=0.1)
        extract_text_to_fp(fin, output_string, laparams=laparams,
                           output_type='text', codec='utf-8')
    return output_string.getvalue()


def extract_text_from_docx(docx_path):
    """Return the text of a .docx file, one line per paragraph."""
    document = Document(docx_path)
    return '\n'.join(para.text for para in document.paragraphs)


def extract_text_from_pptx(pptx_path):
    """Return the text of a .pptx file, one line per text-bearing shape."""
    presentation = Presentation(pptx_path)
    slides_text = [shape.text
                   for slide in presentation.slides
                   for shape in slide.shapes
                   if shape.has_text_frame]
    return '\n'.join(slides_text)


def process_text_into_paragraphs(text):
    """Split extracted *text* into cleaned, de-duplicated paragraphs.

    Bare page numbers are stripped, paragraphs are split on blank lines,
    words hyphenated across a line break are re-joined, and whitespace is
    collapsed.  Duplicates are removed while preserving first-seen order
    (the original used set(), which scrambled paragraph order
    nondeterministically).

    Returns a list of non-empty paragraph strings.
    """
    # Remove page numbers and bare-number headers/footers.
    text = re.sub(r'\n\d+\n', '\n', text)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # Split on runs of blank lines; dict.fromkeys dedupes but keeps
    # insertion order, unlike the original list(set(...)).
    paragraphs = list(dict.fromkeys(re.split(r'\n{2,}', text)))

    cleaned_paragraphs = []
    for para in paragraphs:
        # Re-join words hyphenated across a line break BEFORE collapsing
        # whitespace.  The original collapsed whitespace first, which made
        # its newline-joining regex dead code and caused its hyphen regex
        # to fuse legitimate hyphenated words ("well-known" -> "wellknown").
        cleaned_para = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', para)
        cleaned_para = re.sub(r'\s+', ' ', cleaned_para).strip()
        if cleaned_para:  # keep only non-empty paragraphs
            cleaned_paragraphs.append(cleaned_para)
    return cleaned_paragraphs


def process_files(directory, db):
    """Extract text from every supported file in *directory* and store each
    paragraph as a message in *db*, tagged with the source filename
    (without extension)."""
    fixed_timestamp = "2024-10-22 12:00:00"
    sender = "Arcana"  # All messages are attributed to "Arcana".

    # Dispatch table replaces the original if/elif chain.
    extractors = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.pptx': extract_text_from_pptx,
    }
    files = [f for f in os.listdir(directory)
             if os.path.splitext(f)[1] in extractors]

    with tqdm(total=len(files), desc="Processing Files", unit="file") as pbar:
        for filename in files:
            file_path = os.path.join(directory, filename)
            # Filename without extension becomes the message tag.
            tag, ext = os.path.splitext(filename)
            text = extractors[ext](file_path)
            for paragraph in process_text_into_paragraphs(text):
                db.add_message(sender, fixed_timestamp, paragraph, tag)
            pbar.update(1)
            pbar.set_postfix({"Current File": filename})
            print(filename)


def main(foldername):
    """Build a ChatDatabase from all supported documents in *foldername*."""
    db_filename = foldername
    if os.path.exists(db_filename):
        # The bare path already exists (it is the source folder itself),
        # so store the database next to it with a .txt suffix.
        db_filename += '.txt'
        print(f"Database file '{db_filename}' may already exist. "
              "Rewriting existing database...")
    print(f"Creating new database '{db_filename}'...")
    db = ChatDatabase(db_filename)

    start_time = time.time()
    process_files(foldername, db)
    elapsed = time.time() - start_time
    print(f"\nDatabase creation complete. Total time: {elapsed:.2f} seconds")


if __name__ == "__main__":
    # The original called main() with no argument, which always raised
    # TypeError.  Take the folder name from the command line instead.
    if len(sys.argv) != 2:
        sys.exit(f"Usage: {sys.argv[0]} <folder>")
    main(sys.argv[1])