Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -39,10 +39,37 @@ selected_model_name = models[0] # Default to the first model in the list
|
|
39 |
|
40 |
# Initialize the parser
|
41 |
parser = LlamaParse(api_key=os.getenv("LLAMA_INDEX_API"), result_type='markdown')
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
# Embedding model and index initialization (to be populated by uploaded files)
|
45 |
-
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
|
|
|
46 |
|
47 |
# Global variable to store documents loaded from user-uploaded files
|
48 |
vector_index = None
|
|
|
39 |
|
40 |
# Initialize the parser
|
41 |
parser = LlamaParse(api_key=os.getenv("LLAMA_INDEX_API"), result_type='markdown')
|
42 |
+
# Define file extractor with various common extensions
|
43 |
+
file_extractor = {
|
44 |
+
'.pdf': parser, # PDF documents
|
45 |
+
'.docx': parser, # Microsoft Word documents
|
46 |
+
'.doc': parser, # Older Microsoft Word documents
|
47 |
+
'.txt': parser, # Plain text files
|
48 |
+
'.csv': parser, # Comma-separated values files
|
49 |
+
'.xlsx': parser, # Microsoft Excel files (requires additional processing for tables)
|
50 |
+
'.pptx': parser, # Microsoft PowerPoint files (for slides)
|
51 |
+
'.html': parser, # HTML files (web pages)
|
52 |
+
# '.rtf': parser, # Rich Text Format files
|
53 |
+
# '.odt': parser, # OpenDocument Text files
|
54 |
+
# '.epub': parser, # ePub files (e-books)
|
55 |
+
|
56 |
+
# Image files for OCR processing
|
57 |
+
'.jpg': parser, # JPEG images
|
58 |
+
'.jpeg': parser, # JPEG images
|
59 |
+
'.png': parser, # PNG images
|
60 |
+
# '.bmp': parser, # Bitmap images
|
61 |
+
# '.tiff': parser, # TIFF images
|
62 |
+
# '.tif': parser, # TIFF images (alternative extension)
|
63 |
+
# '.gif': parser, # GIF images (can contain text)
|
64 |
+
|
65 |
+
# Additional image formats (WebP raster images and SVG vector graphics)
|
66 |
+
'.webp': parser, # WebP images
|
67 |
+
'.svg': parser, # SVG files (vector format, may contain embedded text)
|
68 |
+
}
|
69 |
|
70 |
# Embedding model and index initialization (to be populated by uploaded files)
|
71 |
+
# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
|
72 |
+
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
73 |
|
74 |
# Global variable to store documents loaded from user-uploaded files
|
75 |
vector_index = None
|