ashishanand committed on
Commit
1a492a3
·
1 Parent(s): d7955f6

Removed chromadb directory from tracking

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ chromadb/
app.py CHANGED
@@ -13,7 +13,7 @@ from transformers import AutoTokenizer
13
  from rerankers import Reranker
14
  from transformers import GPT2TokenizerFast
15
  from groq import Groq
16
-
17
  import gradio as gr
18
 
19
  # Retrieve the API key from environment variables (Hugging Face Secrets)
@@ -23,7 +23,22 @@ groq_api_key = os.environ.get('GROQ_API_KEY')
23
  chat_client = Groq(api_key=groq_api_key)
24
  model = "llama-3.2-90b-text-preview"
25
 
26
- # Define your functions (same as before)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def preprocess_text(text):
28
  # ... (same as your original function)
29
  text = re.sub(r'\s+', ' ', text)
@@ -53,6 +68,49 @@ def call_Llama_api(query, context):
53
  response = chat_completion.choices[0].message.content
54
  return response
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def is_car_model_available(query, available_models):
57
  # ... (same as your original function)
58
  for model in available_models:
@@ -60,6 +118,16 @@ def is_car_model_available(query, available_models):
60
  return model
61
  return None
62
 
 
 
 
 
 
 
 
 
 
 
63
  def colbert_rerank(query=None, chunks=None):
64
  # ... (same as your original function)
65
  d = ranker.rank(query=query, docs=chunks)
@@ -111,27 +179,68 @@ def initialize():
111
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
112
  print(f"Using device: {device}")
113
 
 
 
114
  # Initialize embedding model
115
  embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
116
  model_name="all-MiniLM-L12-v2", device=device
117
  )
118
 
119
- # Load the persisted ChromaDB collection
120
  client = PersistentClient(path="./chromadb")
121
 
122
  # Get the collection
123
  collection_name = "car_manuals5"
124
- collection = client.get_collection(
125
- name=collection_name,
126
- embedding_function=embedding_function
127
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  # Set available car models
130
- available_car_models = ['TIAGO', 'Astor']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  # Initialize the ranker
133
  ranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type='colbert')
134
-
135
  # Call initialize function
136
  initialize()
137
 
@@ -145,4 +254,5 @@ iface = gr.Interface(
145
  )
146
 
147
  if __name__ == "__main__":
148
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
 
13
  from rerankers import Reranker
14
  from transformers import GPT2TokenizerFast
15
  from groq import Groq
16
+ from chromadb import PersistentClient
17
  import gradio as gr
18
 
19
  # Retrieve the API key from environment variables (Hugging Face Secrets)
 
23
  chat_client = Groq(api_key=groq_api_key)
24
  model = "llama-3.2-90b-text-preview"
25
 
26
+
27
+ def parse_pdf(pdf_path):
28
+
29
+ texts = []
30
+ with pdfplumber.open(pdf_path) as pdf:
31
+ for page_num, page in enumerate(pdf.pages, start=1):
32
+ text = page.extract_text()
33
+ if text:
34
+ texts.append({
35
+ 'text': text,
36
+ 'metadata': {
37
+ 'page_number': page_num
38
+ }
39
+ })
40
+ return texts
41
+
42
  def preprocess_text(text):
43
  # ... (same as your original function)
44
  text = re.sub(r'\s+', ' ', text)
 
68
  response = chat_completion.choices[0].message.content
69
  return response
70
 
71
+
72
+ def chunk_texts(texts, max_tokens=500, overlap_tokens=50):
73
+ """
74
+ Splits texts into chunks based on paragraphs with overlap to preserve context.
75
+
76
+ """
77
+ chunks = []
78
+ for item in texts:
79
+ text = preprocess_text(item['text'])
80
+ if not text:
81
+ continue
82
+ metadata = item['metadata']
83
+ # Split text into paragraphs
84
+ paragraphs = text.split('\n\n')
85
+ current_chunk = ''
86
+ current_tokens = 0
87
+ for i, paragraph in enumerate(paragraphs):
88
+ paragraph = paragraph.strip()
89
+ if not paragraph:
90
+ continue
91
+ paragraph_tokens = len(tokenizer.encode(paragraph))
92
+ if current_tokens + paragraph_tokens <= max_tokens:
93
+ current_chunk += paragraph + '\n\n'
94
+ current_tokens += paragraph_tokens
95
+ else:
96
+ # Save the current chunk
97
+ chunk = {
98
+ 'text': current_chunk.strip(),
99
+ 'metadata': metadata
100
+ }
101
+ chunks.append(chunk)
102
+ # Start a new chunk with overlap
103
+ overlap_text = ' '.join(current_chunk.split()[-overlap_tokens:])
104
+ current_chunk = overlap_text + ' ' + paragraph + '\n\n'
105
+ current_tokens = len(tokenizer.encode(current_chunk))
106
+ if current_chunk:
107
+ chunk = {
108
+ 'text': current_chunk.strip(),
109
+ 'metadata': metadata
110
+ }
111
+ chunks.append(chunk)
112
+ return chunks
113
+
114
  def is_car_model_available(query, available_models):
115
  # ... (same as your original function)
116
  for model in available_models:
 
118
  return model
119
  return None
120
 
121
+ def extract_car_model(pdf_filename):
122
+
123
+ base_name = os.path.basename(pdf_filename)
124
+ match = re.search(r'manual_(.+)\.pdf', base_name)
125
+ if match:
126
+ model_name = match.group(1).replace('_', ' ').title()
127
+ return model_name
128
+ else:
129
+ return 'Unknown Model'
130
+
131
  def colbert_rerank(query=None, chunks=None):
132
  # ... (same as your original function)
133
  d = ranker.rank(query=query, docs=chunks)
 
179
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
180
  print(f"Using device: {device}")
181
 
182
+ tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") # For token counting
183
+
184
  # Initialize embedding model
185
  embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
186
  model_name="all-MiniLM-L12-v2", device=device
187
  )
188
 
 
189
  client = PersistentClient(path="./chromadb")
190
 
191
  # Get the collection
192
  collection_name = "car_manuals5"
193
+
194
+ if collection_name in [col.name for col in client.list_collections()]:
195
+ collection = client.get_collection(
196
+ name=collection_name,
197
+ embedding_function=embedding_function
198
+ )
199
+ available_car_models = ['Tiago', 'Astor']
200
+
201
+ else:
202
+ collection = client.create_collection(
203
+ name=collection_name,
204
+ embedding_function=embedding_function
205
+ )
206
+
207
+
208
+ # collection = client.get_or_create_collection(
209
+ # name=collection_name,
210
+ # embedding_function=embedding_function
211
+ # )
212
 
213
  # Set available car models
214
+ # available_car_models = ['TIAGO', 'Astor']
215
+
216
+ pdf_files = ['./car_manuals/manual_Tiago.pdf', './car_manuals/manual_Astor.pdf']
217
+ available_car_models = []
218
+
219
+ for pdf_file in pdf_files:
220
+ print(f"Parsing {pdf_file}...")
221
+ pdf_texts = parse_pdf(pdf_file)
222
+ car_model = extract_car_model(pdf_file)
223
+ available_car_models.append(car_model)
224
+ # Add car model to metadata
225
+ for item in pdf_texts:
226
+ item['metadata']['car_model'] = car_model
227
+ # Chunk texts using the refined strategy
228
+ chunks = chunk_texts(pdf_texts, max_tokens=500, overlap_tokens=50)
229
+ # Prepare data for ChromaDB
230
+ documents = [chunk['text'] for chunk in chunks]
231
+ metadatas = [chunk['metadata'] for chunk in chunks]
232
+ ids = [f"{car_model}_{i}" for i in range(len(documents))]
233
+ # Add to ChromaDB collection
234
+ collection.add(
235
+ documents=documents,
236
+ metadatas=metadatas,
237
+ ids=ids
238
+ )
239
+
240
+
241
 
242
  # Initialize the ranker
243
  ranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type='colbert')
 
244
  # Call initialize function
245
  initialize()
246
 
 
254
  )
255
 
256
  if __name__ == "__main__":
257
+ # iface.launch(server_name="0.0.0.0", server_port=7860)
258
+ iface.launch()
chromadb/chroma.sqlite3 → car-manuals/manual_Astor.pdf RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:481a2f22b50f9edd260645533393020b100f9c8e43ba5393925af96c02af9a2f
3
- size 6451200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7275b9aae94841441d33ec596e65ffe2bd738f42a980ab1b53d26d35a725b73e
3
+ size 8105807
chromadb/e820442b-1d6c-4933-8a2c-981f60377458/data_level0.bin → car-manuals/manual_Tiago.pdf RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf95cb4ad00dbb2be6ce91b7143b22b48c7583817cc57b4fc153791554a14132
3
- size 1676000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b71ee499e53973ccbabdd49b11995cc374bf9c543d372d4bc63ea8f7414cd7fa
3
+ size 2564414
chromadb/e820442b-1d6c-4933-8a2c-981f60377458/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
3
- size 100
 
 
 
 
chromadb/e820442b-1d6c-4933-8a2c-981f60377458/length.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e1219ac9d4a1a30d3d5f9f3dfc60df85e0844f2b73f04e8f641cc4a101a470
3
- size 4000
 
 
 
 
chromadb/e820442b-1d6c-4933-8a2c-981f60377458/link_lists.bin DELETED
File without changes
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
- # requirements.txt
2
-
3
  gradio
4
  torch
5
  sentence_transformers
 
 
 
1
  gradio
2
  torch
3
  sentence_transformers