ashishanand committed on
Commit
a05b44b
·
1 Parent(s): 1a492a3

Removed pdf directory from tracking

Browse files
.gitattributes CHANGED
@@ -33,5 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
36
  chromadb/* filter=lfs diff=lfs merge=lfs -text
37
  car-manuals/* filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chromadb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
37
+ chromadb/e820442b-1d6c-4933-8a2c-981f60377458 filter=lfs diff=lfs merge=lfs -text
38
  chromadb/* filter=lfs diff=lfs merge=lfs -text
39
  car-manuals/* filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,17 +1,12 @@
1
  # app.py
2
 
3
  import os
4
- import re
5
- import numpy as np
6
  import torch
7
- from sentence_transformers import SentenceTransformer
8
- import pdfplumber
9
- from chromadb import Client
10
- from chromadb.config import Settings
11
  from chromadb.utils import embedding_functions
12
- from transformers import AutoTokenizer
13
  from rerankers import Reranker
14
- from transformers import GPT2TokenizerFast
15
  from groq import Groq
16
  from chromadb import PersistentClient
17
  import gradio as gr
@@ -24,26 +19,26 @@ chat_client = Groq(api_key=groq_api_key)
24
  model = "llama-3.2-90b-text-preview"
25
 
26
 
27
- def parse_pdf(pdf_path):
28
 
29
- texts = []
30
- with pdfplumber.open(pdf_path) as pdf:
31
- for page_num, page in enumerate(pdf.pages, start=1):
32
- text = page.extract_text()
33
- if text:
34
- texts.append({
35
- 'text': text,
36
- 'metadata': {
37
- 'page_number': page_num
38
- }
39
- })
40
- return texts
41
 
42
- def preprocess_text(text):
43
- # ... (same as your original function)
44
- text = re.sub(r'\s+', ' ', text)
45
- text = text.strip()
46
- return text
47
 
48
  def call_Llama_api(query, context):
49
  # ... (same as your original function)
@@ -69,47 +64,48 @@ def call_Llama_api(query, context):
69
  return response
70
 
71
 
72
- def chunk_texts(texts, max_tokens=500, overlap_tokens=50):
73
- """
74
- Splits texts into chunks based on paragraphs with overlap to preserve context.
75
-
76
- """
77
- chunks = []
78
- for item in texts:
79
- text = preprocess_text(item['text'])
80
- if not text:
81
- continue
82
- metadata = item['metadata']
83
- # Split text into paragraphs
84
- paragraphs = text.split('\n\n')
85
- current_chunk = ''
86
- current_tokens = 0
87
- for i, paragraph in enumerate(paragraphs):
88
- paragraph = paragraph.strip()
89
- if not paragraph:
90
- continue
91
- paragraph_tokens = len(tokenizer.encode(paragraph))
92
- if current_tokens + paragraph_tokens <= max_tokens:
93
- current_chunk += paragraph + '\n\n'
94
- current_tokens += paragraph_tokens
95
- else:
96
- # Save the current chunk
97
- chunk = {
98
- 'text': current_chunk.strip(),
99
- 'metadata': metadata
100
- }
101
- chunks.append(chunk)
102
- # Start a new chunk with overlap
103
- overlap_text = ' '.join(current_chunk.split()[-overlap_tokens:])
104
- current_chunk = overlap_text + ' ' + paragraph + '\n\n'
105
- current_tokens = len(tokenizer.encode(current_chunk))
106
- if current_chunk:
107
- chunk = {
108
- 'text': current_chunk.strip(),
109
- 'metadata': metadata
110
- }
111
- chunks.append(chunk)
112
- return chunks
 
113
 
114
  def is_car_model_available(query, available_models):
115
  # ... (same as your original function)
@@ -118,15 +114,15 @@ def is_car_model_available(query, available_models):
118
  return model
119
  return None
120
 
121
- def extract_car_model(pdf_filename):
122
 
123
- base_name = os.path.basename(pdf_filename)
124
- match = re.search(r'manual_(.+)\.pdf', base_name)
125
- if match:
126
- model_name = match.group(1).replace('_', ' ').title()
127
- return model_name
128
- else:
129
- return 'Unknown Model'
130
 
131
  def colbert_rerank(query=None, chunks=None):
132
  # ... (same as your original function)
@@ -179,7 +175,7 @@ def initialize():
179
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
180
  print(f"Using device: {device}")
181
 
182
- tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") # For token counting
183
 
184
  # Initialize embedding model
185
  embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
@@ -191,18 +187,18 @@ def initialize():
191
  # Get the collection
192
  collection_name = "car_manuals5"
193
 
194
- if collection_name in [col.name for col in client.list_collections()]:
195
- collection = client.get_collection(
196
- name=collection_name,
197
- embedding_function=embedding_function
198
- )
199
- available_car_models = ['Tiago', 'Astor']
200
 
201
- else:
202
- collection = client.create_collection(
203
- name=collection_name,
204
- embedding_function=embedding_function
205
- )
206
 
207
 
208
  # collection = client.get_or_create_collection(
@@ -213,29 +209,29 @@ def initialize():
213
  # Set available car models
214
  # available_car_models = ['TIAGO', 'Astor']
215
 
216
- pdf_files = ['./car_manuals/manual_Tiago.pdf', './car_manuals/manual_Astor.pdf']
217
- available_car_models = []
218
-
219
- for pdf_file in pdf_files:
220
- print(f"Parsing {pdf_file}...")
221
- pdf_texts = parse_pdf(pdf_file)
222
- car_model = extract_car_model(pdf_file)
223
- available_car_models.append(car_model)
224
- # Add car model to metadata
225
- for item in pdf_texts:
226
- item['metadata']['car_model'] = car_model
227
- # Chunk texts using the refined strategy
228
- chunks = chunk_texts(pdf_texts, max_tokens=500, overlap_tokens=50)
229
- # Prepare data for ChromaDB
230
- documents = [chunk['text'] for chunk in chunks]
231
- metadatas = [chunk['metadata'] for chunk in chunks]
232
- ids = [f"{car_model}_{i}" for i in range(len(documents))]
233
- # Add to ChromaDB collection
234
- collection.add(
235
- documents=documents,
236
- metadatas=metadatas,
237
- ids=ids
238
- )
239
 
240
 
241
 
 
1
  # app.py
2
 
3
  import os
4
+ # import re
 
5
  import torch
6
+ # import pdfplumber
 
 
 
7
  from chromadb.utils import embedding_functions
 
8
  from rerankers import Reranker
9
+ # from transformers import GPT2TokenizerFast
10
  from groq import Groq
11
  from chromadb import PersistentClient
12
  import gradio as gr
 
19
  model = "llama-3.2-90b-text-preview"
20
 
21
 
22
+ # def parse_pdf(pdf_path):
23
 
24
+ # texts = []
25
+ # with pdfplumber.open(pdf_path) as pdf:
26
+ # for page_num, page in enumerate(pdf.pages, start=1):
27
+ # text = page.extract_text()
28
+ # if text:
29
+ # texts.append({
30
+ # 'text': text,
31
+ # 'metadata': {
32
+ # 'page_number': page_num
33
+ # }
34
+ # })
35
+ # return texts
36
 
37
+ # def preprocess_text(text):
38
+ # # ... (same as your original function)
39
+ # text = re.sub(r'\s+', ' ', text)
40
+ # text = text.strip()
41
+ # return text
42
 
43
  def call_Llama_api(query, context):
44
  # ... (same as your original function)
 
64
  return response
65
 
66
 
67
+ # def chunk_texts(texts, max_tokens=500, overlap_tokens=50):
68
+ # """
69
+ # Splits texts into chunks based on paragraphs with overlap to preserve context.
70
+
71
+ # """
72
+ # global tokenizer
73
+ # chunks = []
74
+ # for item in texts:
75
+ # text = preprocess_text(item['text'])
76
+ # if not text:
77
+ # continue
78
+ # metadata = item['metadata']
79
+ # # Split text into paragraphs
80
+ # paragraphs = text.split('\n\n')
81
+ # current_chunk = ''
82
+ # current_tokens = 0
83
+ # for i, paragraph in enumerate(paragraphs):
84
+ # paragraph = paragraph.strip()
85
+ # if not paragraph:
86
+ # continue
87
+ # paragraph_tokens = len(tokenizer.encode(paragraph))
88
+ # if current_tokens + paragraph_tokens <= max_tokens:
89
+ # current_chunk += paragraph + '\n\n'
90
+ # current_tokens += paragraph_tokens
91
+ # else:
92
+ # # Save the current chunk
93
+ # chunk = {
94
+ # 'text': current_chunk.strip(),
95
+ # 'metadata': metadata
96
+ # }
97
+ # chunks.append(chunk)
98
+ # # Start a new chunk with overlap
99
+ # overlap_text = ' '.join(current_chunk.split()[-overlap_tokens:])
100
+ # current_chunk = overlap_text + ' ' + paragraph + '\n\n'
101
+ # current_tokens = len(tokenizer.encode(current_chunk))
102
+ # if current_chunk:
103
+ # chunk = {
104
+ # 'text': current_chunk.strip(),
105
+ # 'metadata': metadata
106
+ # }
107
+ # chunks.append(chunk)
108
+ # return chunks
109
 
110
  def is_car_model_available(query, available_models):
111
  # ... (same as your original function)
 
114
  return model
115
  return None
116
 
117
+ # def extract_car_model(pdf_filename):
118
 
119
+ # base_name = os.path.basename(pdf_filename)
120
+ # match = re.search(r'manual_(.+)\.pdf', base_name)
121
+ # if match:
122
+ # model_name = match.group(1).replace('_', ' ').title()
123
+ # return model_name
124
+ # else:
125
+ # return 'Unknown Model'
126
 
127
  def colbert_rerank(query=None, chunks=None):
128
  # ... (same as your original function)
 
175
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
176
  print(f"Using device: {device}")
177
 
178
+ # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") # For token counting
179
 
180
  # Initialize embedding model
181
  embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
 
187
  # Get the collection
188
  collection_name = "car_manuals5"
189
 
190
+ # if collection_name in [col.name for col in client.list_collections()]:
191
+ # collection = client.get_collection(
192
+ # name=collection_name,
193
+ # embedding_function=embedding_function
194
+ # )
195
+ available_car_models = ['Tiago', 'Astor']
196
 
197
+ # else:
198
+ collection = client.get_collection(
199
+ name=collection_name,
200
+ embedding_function=embedding_function
201
+ )
202
 
203
 
204
  # collection = client.get_or_create_collection(
 
209
  # Set available car models
210
  # available_car_models = ['TIAGO', 'Astor']
211
 
212
+ # pdf_files = ['./car_manuals/manual_Tiago.pdf', './car_manuals/manual_Astor.pdf']
213
+ # available_car_models = []
214
+
215
+ # for pdf_file in pdf_files:
216
+ # print(f"Parsing {pdf_file}...")
217
+ # pdf_texts = parse_pdf(pdf_file)
218
+ # car_model = extract_car_model(pdf_file)
219
+ # available_car_models.append(car_model)
220
+ # # Add car model to metadata
221
+ # for item in pdf_texts:
222
+ # item['metadata']['car_model'] = car_model
223
+ # # Chunk texts using the refined strategy
224
+ # chunks = chunk_texts(pdf_texts, max_tokens=500, overlap_tokens=50)
225
+ # # Prepare data for ChromaDB
226
+ # documents = [chunk['text'] for chunk in chunks]
227
+ # metadatas = [chunk['metadata'] for chunk in chunks]
228
+ # ids = [f"{car_model}_{i}" for i in range(len(documents))]
229
+ # # Add to ChromaDB collection
230
+ # collection.add(
231
+ # documents=documents,
232
+ # metadatas=metadatas,
233
+ # ids=ids
234
+ # )
235
 
236
 
237
 
car-manuals/manual_Astor.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7275b9aae94841441d33ec596e65ffe2bd738f42a980ab1b53d26d35a725b73e
3
- size 8105807
 
 
 
 
car-manuals/manual_Tiago.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b71ee499e53973ccbabdd49b11995cc374bf9c543d372d4bc63ea8f7414cd7fa
3
- size 2564414