Sarath0x8f committed on
Commit
90a2d71
·
verified ·
1 Parent(s): 3e21c23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -2
app.py CHANGED
@@ -39,10 +39,37 @@ selected_model_name = models[0] # Default to the first model in the list
39
 
40
  # Initialize the parser
41
  parser = LlamaParse(api_key=os.getenv("LLAMA_INDEX_API"), result_type='markdown')
42
- file_extractor = {'.pdf': parser, '.docx': parser, '.doc': parser}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  # Embedding model and index initialization (to be populated by uploaded files)
45
- embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
 
46
 
47
  # Global variable to store documents loaded from user-uploaded files
48
  vector_index = None
 
39
 
40
  # Initialize the parser
41
  parser = LlamaParse(api_key=os.getenv("LLAMA_INDEX_API"), result_type='markdown')
42
+ # Define file extractor with various common extensions
43
+ file_extractor = {
44
+ '.pdf': parser, # PDF documents
45
+ '.docx': parser, # Microsoft Word documents
46
+ '.doc': parser, # Older Microsoft Word documents
47
+ '.txt': parser, # Plain text files
48
+ '.csv': parser, # Comma-separated values files
49
+ '.xlsx': parser, # Microsoft Excel files (requires additional processing for tables)
50
+ '.pptx': parser, # Microsoft PowerPoint files (for slides)
51
+ '.html': parser, # HTML files (web pages)
52
+ # '.rtf': parser, # Rich Text Format files
53
+ # '.odt': parser, # OpenDocument Text files
54
+ # '.epub': parser, # ePub files (e-books)
55
+
56
+ # Image files for OCR processing
57
+ '.jpg': parser, # JPEG images
58
+ '.jpeg': parser, # JPEG images
59
+ '.png': parser, # PNG images
60
+ # '.bmp': parser, # Bitmap images
61
+ # '.tiff': parser, # TIFF images
62
+ # '.tif': parser, # TIFF images (alternative extension)
63
+ # '.gif': parser, # GIF images (can contain text)
64
+
65
+ # Scanned documents in image formats
66
+ '.webp': parser, # WebP images
67
+ '.svg': parser, # SVG files (vector format, may contain embedded text)
68
+ }
69
 
70
  # Embedding model and index initialization (to be populated by uploaded files)
71
+ # embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
72
+ embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
73
 
74
  # Global variable to store documents loaded from user-uploaded files
75
  vector_index = None