Update Arcana.py
Browse files
Arcana.py
CHANGED
@@ -21,7 +21,7 @@ def process_text_into_paragraphs(text):
|
|
21 |
text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
|
22 |
|
23 |
# Split text into paragraphs
|
24 |
-
paragraphs = re.split(r'\n{2,}', text)
|
25 |
|
26 |
# Clean up each paragraph
|
27 |
cleaned_paragraphs = []
|
@@ -58,16 +58,17 @@ def process_pdfs(directory, db):
|
|
58 |
pbar.set_postfix({"Current File": filename})
|
59 |
print(filename)
|
60 |
|
61 |
-
def main():
|
62 |
-
db_filename =
|
63 |
|
64 |
if os.path.exists(db_filename):
|
|
|
65 |
print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
|
66 |
#db = ChatDatabase(db_filename)
|
67 |
#else:
|
68 |
print(f"Creating new database '{db_filename}'...")
|
69 |
db = ChatDatabase(db_filename)
|
70 |
-
pdf_directory =
|
71 |
|
72 |
start_time = time.time()
|
73 |
process_pdfs(pdf_directory, db)
|
|
|
21 |
text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
|
22 |
|
23 |
# Split text into paragraphs
|
24 |
+
paragraphs = list(set(re.split(r'\n{2,}', text)))
|
25 |
|
26 |
# Clean up each paragraph
|
27 |
cleaned_paragraphs = []
|
|
|
58 |
pbar.set_postfix({"Current File": filename})
|
59 |
print(filename)
|
60 |
|
61 |
+
def main(foldername):
|
62 |
+
db_filename = foldername
|
63 |
|
64 |
if os.path.exists(db_filename):
|
65 |
+
db_filename += '.txt'
|
66 |
print(f"Database file '{db_filename}' already exists. Rewriting existing database...")
|
67 |
#db = ChatDatabase(db_filename)
|
68 |
#else:
|
69 |
print(f"Creating new database '{db_filename}'...")
|
70 |
db = ChatDatabase(db_filename)
|
71 |
+
pdf_directory = foldername
|
72 |
|
73 |
start_time = time.time()
|
74 |
process_pdfs(pdf_directory, db)
|