Spaces:

ashishanand
/

car_manual_assistant

Sleeping

App Files Community

ashishanand committed on Nov 17, 2024

Commit

a05b44b

1 Parent(s): 1a492a3

Removed pdf directory from tracking

Browse files

Files changed (4) hide show

.gitattributes +2 -0
app.py +106 -110
car-manuals/manual_Astor.pdf +0 -3
car-manuals/manual_Tiago.pdf +0 -3

.gitattributes CHANGED Viewed

@@ -33,5 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 chromadb/* filter=lfs diff=lfs merge=lfs -text
 car-manuals/* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+chromadb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+chromadb/e820442b-1d6c-4933-8a2c-981f60377458 filter=lfs diff=lfs merge=lfs -text
 chromadb/* filter=lfs diff=lfs merge=lfs -text
 car-manuals/* filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,17 +1,12 @@
 # app.py
 import os
-import re
-import numpy as np
 import torch
-from sentence_transformers import SentenceTransformer
-import pdfplumber
-from chromadb import Client
-from chromadb.config import Settings
 from chromadb.utils import embedding_functions
-from transformers import AutoTokenizer
 from rerankers import Reranker
-from transformers import GPT2TokenizerFast
 from groq import Groq
 from chromadb import PersistentClient
 import gradio as gr
@@ -24,26 +19,26 @@ chat_client = Groq(api_key=groq_api_key)
 model = "llama-3.2-90b-text-preview"
-def parse_pdf(pdf_path):
-    texts = []
-    with pdfplumber.open(pdf_path) as pdf:
-        for page_num, page in enumerate(pdf.pages, start=1):
-            text = page.extract_text()
-            if text:
-                texts.append({
-                    'text': text,
-                    'metadata': {
-                        'page_number': page_num
-                    }
-                })
-    return texts
-def preprocess_text(text):
-    # ... (same as your original function)
-    text = re.sub(r'\s+', ' ', text)
-    text = text.strip()
-    return text
 def call_Llama_api(query, context):
     # ... (same as your original function)
@@ -69,47 +64,48 @@ def call_Llama_api(query, context):
     return response
-def chunk_texts(texts, max_tokens=500, overlap_tokens=50):
-    """
-    Splits texts into chunks based on paragraphs with overlap to preserve context.
-    """
-    chunks = []
-    for item in texts:
-        text = preprocess_text(item['text'])
-        if not text:
-            continue
-        metadata = item['metadata']
-        # Split text into paragraphs
-        paragraphs = text.split('\n\n')
-        current_chunk = ''
-        current_tokens = 0
-        for i, paragraph in enumerate(paragraphs):
-            paragraph = paragraph.strip()
-            if not paragraph:
-                continue
-            paragraph_tokens = len(tokenizer.encode(paragraph))
-            if current_tokens + paragraph_tokens <= max_tokens:
-                current_chunk += paragraph + '\n\n'
-                current_tokens += paragraph_tokens
-            else:
-                # Save the current chunk
-                chunk = {
-                    'text': current_chunk.strip(),
-                    'metadata': metadata
-                }
-                chunks.append(chunk)
-                # Start a new chunk with overlap
-                overlap_text = ' '.join(current_chunk.split()[-overlap_tokens:])
-                current_chunk = overlap_text + ' ' + paragraph + '\n\n'
-                current_tokens = len(tokenizer.encode(current_chunk))
-        if current_chunk:
-            chunk = {
-                'text': current_chunk.strip(),
-                'metadata': metadata
-            }
-            chunks.append(chunk)
-    return chunks
 def is_car_model_available(query, available_models):
     # ... (same as your original function)
@@ -118,15 +114,15 @@ def is_car_model_available(query, available_models):
             return model
     return None
-def extract_car_model(pdf_filename):
-    base_name = os.path.basename(pdf_filename)
-    match = re.search(r'manual_(.+)\.pdf', base_name)
-    if match:
-        model_name = match.group(1).replace('_', ' ').title()
-        return model_name
-    else:
-        return 'Unknown Model'
 def colbert_rerank(query=None, chunks=None):
     # ... (same as your original function)
@@ -179,7 +175,7 @@ def initialize():
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f"Using device: {device}")
-    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")  # For token counting
     # Initialize embedding model
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
@@ -191,18 +187,18 @@ def initialize():
     # Get the collection
     collection_name = "car_manuals5"
-    if collection_name in [col.name for col in client.list_collections()]:
-        collection = client.get_collection(
-            name=collection_name,
-            embedding_function=embedding_function
-        )
-        available_car_models = ['Tiago', 'Astor']
-    else:
-        collection = client.create_collection(
-            name=collection_name,
-            embedding_function=embedding_function
-        )
     # collection = client.get_or_create_collection(
@@ -213,29 +209,29 @@ def initialize():
     # Set available car models
     # available_car_models = ['TIAGO', 'Astor']
-        pdf_files = ['./car_manuals/manual_Tiago.pdf', './car_manuals/manual_Astor.pdf']
-        available_car_models = []
-        for pdf_file in pdf_files:
-            print(f"Parsing {pdf_file}...")
-            pdf_texts = parse_pdf(pdf_file)
-            car_model = extract_car_model(pdf_file)
-            available_car_models.append(car_model)
-            # Add car model to metadata
-            for item in pdf_texts:
-                item['metadata']['car_model'] = car_model
-            # Chunk texts using the refined strategy
-            chunks = chunk_texts(pdf_texts, max_tokens=500, overlap_tokens=50)
-            # Prepare data for ChromaDB
-            documents = [chunk['text'] for chunk in chunks]
-            metadatas = [chunk['metadata'] for chunk in chunks]
-            ids = [f"{car_model}_{i}" for i in range(len(documents))]
-            # Add to ChromaDB collection
-            collection.add(
-                documents=documents,
-                metadatas=metadatas,
-                ids=ids
-            )

 # app.py
 import os
+# import re
 import torch
+# import pdfplumber
 from chromadb.utils import embedding_functions
 from rerankers import Reranker
+# from transformers import GPT2TokenizerFast
 from groq import Groq
 from chromadb import PersistentClient
 import gradio as gr
 model = "llama-3.2-90b-text-preview"
+# def parse_pdf(pdf_path):
+#     texts = []
+#     with pdfplumber.open(pdf_path) as pdf:
+#         for page_num, page in enumerate(pdf.pages, start=1):
+#             text = page.extract_text()
+#             if text:
+#                 texts.append({
+#                     'text': text,
+#                     'metadata': {
+#                         'page_number': page_num
+#                     }
+#                 })
+#     return texts
+# def preprocess_text(text):
+#     # ... (same as your original function)
+#     text = re.sub(r'\s+', ' ', text)
+#     text = text.strip()
+#     return text
 def call_Llama_api(query, context):
     # ... (same as your original function)
     return response
+# def chunk_texts(texts, max_tokens=500, overlap_tokens=50):
+#     """
+#     Splits texts into chunks based on paragraphs with overlap to preserve context.
+#     """
+#     global tokenizer
+#     chunks = []
+#     for item in texts:
+#         text = preprocess_text(item['text'])
+#         if not text:
+#             continue
+#         metadata = item['metadata']
+#         # Split text into paragraphs
+#         paragraphs = text.split('\n\n')
+#         current_chunk = ''
+#         current_tokens = 0
+#         for i, paragraph in enumerate(paragraphs):
+#             paragraph = paragraph.strip()
+#             if not paragraph:
+#                 continue
+#             paragraph_tokens = len(tokenizer.encode(paragraph))
+#             if current_tokens + paragraph_tokens <= max_tokens:
+#                 current_chunk += paragraph + '\n\n'
+#                 current_tokens += paragraph_tokens
+#             else:
+#                 # Save the current chunk
+#                 chunk = {
+#                     'text': current_chunk.strip(),
+#                     'metadata': metadata
+#                 }
+#                 chunks.append(chunk)
+#                 # Start a new chunk with overlap
+#                 overlap_text = ' '.join(current_chunk.split()[-overlap_tokens:])
+#                 current_chunk = overlap_text + ' ' + paragraph + '\n\n'
+#                 current_tokens = len(tokenizer.encode(current_chunk))
+#         if current_chunk:
+#             chunk = {
+#                 'text': current_chunk.strip(),
+#                 'metadata': metadata
+#             }
+#             chunks.append(chunk)
+#     return chunks
 def is_car_model_available(query, available_models):
     # ... (same as your original function)
             return model
     return None
+# def extract_car_model(pdf_filename):
+#     base_name = os.path.basename(pdf_filename)
+#     match = re.search(r'manual_(.+)\.pdf', base_name)
+#     if match:
+#         model_name = match.group(1).replace('_', ' ').title()
+#         return model_name
+#     else:
+#         return 'Unknown Model'
 def colbert_rerank(query=None, chunks=None):
     # ... (same as your original function)
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f"Using device: {device}")
+    # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")  # For token counting
     # Initialize embedding model
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
     # Get the collection
     collection_name = "car_manuals5"
+    # if collection_name in [col.name for col in client.list_collections()]:
+    #     collection = client.get_collection(
+    #         name=collection_name,
+    #         embedding_function=embedding_function
+    #     )
+    available_car_models = ['Tiago', 'Astor']
+    # else:
+    collection = client.get_collection(
+        name=collection_name,
+        embedding_function=embedding_function
+    )
     # collection = client.get_or_create_collection(
     # Set available car models
     # available_car_models = ['TIAGO', 'Astor']
+        # pdf_files = ['./car_manuals/manual_Tiago.pdf', './car_manuals/manual_Astor.pdf']
+        # available_car_models = []
+        # for pdf_file in pdf_files:
+        #     print(f"Parsing {pdf_file}...")
+        #     pdf_texts = parse_pdf(pdf_file)
+        #     car_model = extract_car_model(pdf_file)
+        #     available_car_models.append(car_model)
+        #     # Add car model to metadata
+        #     for item in pdf_texts:
+        #         item['metadata']['car_model'] = car_model
+        #     # Chunk texts using the refined strategy
+        #     chunks = chunk_texts(pdf_texts, max_tokens=500, overlap_tokens=50)
+        #     # Prepare data for ChromaDB
+        #     documents = [chunk['text'] for chunk in chunks]
+        #     metadatas = [chunk['metadata'] for chunk in chunks]
+        #     ids = [f"{car_model}_{i}" for i in range(len(documents))]
+        #     # Add to ChromaDB collection
+        #     collection.add(
+        #         documents=documents,
+        #         metadatas=metadatas,
+        #         ids=ids
+        #     )

car-manuals/manual_Astor.pdf DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7275b9aae94841441d33ec596e65ffe2bd738f42a980ab1b53d26d35a725b73e
-size 8105807

car-manuals/manual_Tiago.pdf DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b71ee499e53973ccbabdd49b11995cc374bf9c543d372d4bc63ea8f7414cd7fa
-size 2564414