|
import os |
|
import io |
|
from pdfminer.high_level import extract_text_to_fp |
|
from pdfminer.layout import LAParams |
|
import re |
|
from tqdm import tqdm |
|
import time |
|
from nylon import ChatDatabase, get_keywords |
|
|
|
def extract_text_from_pdf(pdf_path):
    """Return the plain text that pdfminer extracts from a PDF file.

    The file at ``pdf_path`` is opened in binary mode and rendered as text
    into an in-memory buffer, using explicit layout-analysis margins.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Everything pdfminer wrote to the buffer, as a single string.
    """
    layout = LAParams(line_margin=1.5, char_margin=2.0, word_margin=0.1)
    buffer = io.StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        extract_text_to_fp(
            pdf_file,
            buffer,
            laparams=layout,
            output_type='text',
            codec='utf-8',
        )
    return buffer.getvalue()
|
|
|
def process_text_into_paragraphs(text):
    """Split extracted text into cleaned, single-line paragraphs.

    Lines consisting only of digits (presumably page numbers) are removed,
    the text is split on runs of two or more newlines, and each paragraph
    has its whitespace collapsed to single spaces. A word that was broken by
    a hyphen at the end of a line is rejoined; hyphens that appear inside a
    line ("well-known", "10-20") are left untouched.

    Args:
        text: Raw text to clean up.

    Returns:
        A list of non-empty paragraph strings, in document order.
    """
    # Drop digits-only lines: first the tight "newline, digits, newline"
    # form, then lines where the digits are padded with whitespace.
    text = re.sub(r'\n\d+\n', '\n', text)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    cleaned_paragraphs = []
    for para in re.split(r'\n{2,}', text):
        # Rejoin words split across a line break. This has to happen before
        # whitespace is collapsed: afterwards a line-break hyphen can no
        # longer be told apart from a genuine in-line hyphen.
        para = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', para)
        cleaned_para = re.sub(r'\s+', ' ', para).strip()
        if cleaned_para:
            cleaned_paragraphs.append(cleaned_para)

    return cleaned_paragraphs
|
|
|
def process_pdfs(directory, db, *, sender="Arcana", timestamp="2024-10-22 12:00:00"):
    """Store every paragraph of every PDF in ``directory`` as a message in ``db``.

    Each PDF is converted to text, split into paragraphs, and every paragraph
    is passed to ``db.add_message`` tagged with the PDF's file name without
    its extension. Files are handled in sorted name order so repeated runs
    insert messages in the same order.

    Args:
        directory: Directory to scan (non-recursively) for PDF files.
        db: Object exposing ``add_message(sender, timestamp, text, tag)``.
        sender: Sender recorded on every stored message.
        timestamp: Timestamp string recorded on every stored message.
    """
    # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
    pdf_files = sorted(
        name for name in os.listdir(directory) if name.lower().endswith('.pdf')
    )

    with tqdm(total=len(pdf_files), desc="Processing PDFs", unit="file") as pbar:
        for filename in pdf_files:
            # Label the bar before the work starts so it names the file that
            # is actually being processed.
            pbar.set_postfix({"Current File": filename})

            pdf_path = os.path.join(directory, filename)
            tag = os.path.splitext(filename)[0]

            text = extract_text_from_pdf(pdf_path)
            for paragraph in process_text_into_paragraphs(text):
                # tqdm.write prints without corrupting the active bar, which
                # a plain print() would do.
                tqdm.write(paragraph)
                db.add_message(sender, timestamp, paragraph, tag)

            pbar.update(1)
|
|
|
def main():
    """Open the chat database and load the PDFs from the cache directory into it.

    Prints whether the database file already existed, runs the PDF import,
    and reports the elapsed wall-clock time.
    """
    db_filename = 'memory.txt'
    pdf_directory = 'cache'

    # The existence check only selects the status message; the database is
    # opened the same way in both cases.
    if os.path.exists(db_filename):
        status = f"Database file '{db_filename}' already exists. Loading existing database..."
    else:
        status = f"Creating new database '{db_filename}'..."
    print(status)
    db = ChatDatabase(db_filename)

    started = time.time()
    process_pdfs(pdf_directory, db)
    elapsed = time.time() - started

    print(f"\nDatabase creation complete. Total time: {elapsed:.2f} seconds")
|
|
|
# Run the import only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|
|