# Flask service: semantic passage search backed by Elasticsearch.
import logging
import os

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from flask import Flask, request, jsonify
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
#Creat the flask instance Using create_app | |
app=Flask(__name__) | |
# Configure logging | |
#logging.basicConfig(filename='app.log', level=logging.INFO) | |
""" | |
Functions for request/response validation | |
""" | |
# Define a function for request validation | |
def validate_request(request_data):
    """Return True when the incoming payload carries a 'question' field."""
    return 'question' in request_data
# Define a function for response validation | |
def validate_response(response_data):
    """Return True when the outgoing payload carries a 'message' field."""
    return 'message' in response_data
""" | |
Function for preparing csv for indexing | |
""" | |
def prepare_documents(df):
    """Convert a DataFrame of passages into Elasticsearch document dicts.

    Each row must provide 'Passages', 'Metadata' and 'Embedding'
    (a NumPy vector); the embedding is serialized to a plain list
    under a dense_vector wrapper.
    """
    return [
        {
            "Passages": record["Passages"],
            "Metadata": record["Metadata"],
            "Embedding": {
                "type": "dense_vector",
                "dims": 3,  # declared dimensionality of the dense vectors
                "value": record["Embedding"].tolist(),
            },
        }
        for _, record in df.iterrows()
    ]
""" | |
function for working with retrival responses | |
""" | |
# Extract relevant passages, metadata, and scores | |
def Extraction(response, question_embedding):
    """Rank retrieved hits by cosine similarity to the question embedding.

    Returns the three best-scoring entries as dicts with keys
    'passage', 'metadata' and 'score' (higher means more similar).
    """
    scored = []
    for hit in response["hits"]["hits"]:
        source = hit["_source"]
        embedding = np.array(source["Embedding"]["value"])
        # Cosine *similarity* = 1 - cosine distance.
        similarity = 1 - cosine(question_embedding, embedding)
        scored.append({
            "passage": source["Passages"],
            "metadata": source["Metadata"],
            "score": similarity,
        })
    # Most similar first, then keep only the best three.
    scored.sort(key=lambda entry: entry["score"], reverse=True)
    return scored[:3]
# Create the Elasticsearch client instance.
# SECURITY: the endpoint URL and API key were hard-coded in source.
# Prefer the ES_URL / ES_API_KEY environment variables; the old literals
# remain only as a fallback so existing deployments keep working —
# rotate this exposed key and remove the fallback as soon as possible.
es = Elasticsearch(
    os.environ.get(
        "ES_URL",
        "https://92d997736474439dae5ccfaedc2ad990.us-central1.gcp.cloud.es.io:443",
    ),
    api_key=os.environ.get(
        "ES_API_KEY",
        "Ym16RzI0b0JIcXpRTU9NQUNUNE46YnBmaUtCWHdTNXlnN1dZR2w4Rllqdw==",
    ),
)
#app.logger.info(msg='es instance created')
""" | |
Question asking endpoint | |
""" | |
# Define an endpoint for receiving a user question via POST request | |
def receive_question():
    """Handle a POST with JSON {"question": ...}; return the top-3 passages.

    Runs a lexical Elasticsearch query for the question, then re-ranks
    the hits by embedding cosine similarity via Extraction().
    NOTE(review): no @app.route decorator is visible in this chunk —
    confirm this handler is registered elsewhere (e.g. app.add_url_rule).
    """
    # Validate BEFORE touching the payload: request.get_json() can yield
    # None (missing/invalid JSON), and the original code dereferenced it
    # prior to validation, turning a bad request into a 500.
    question_data = request.get_json(silent=True)
    if not question_data or not validate_request(question_data):
        return jsonify({'error': 'Invalid request data'}), 400
    user_question = question_data['question']

    # Load the sentence encoder once and cache it on the function; the
    # original re-instantiated the model on every request.
    model = getattr(receive_question, '_model', None)
    if model is None:
        model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')
        receive_question._model = model
    question_embedding = model.encode(user_question).tolist()

    # Index name created on Elasticsearch.
    index_name = "search-passagemetadataemb"
    # Lexical search; Extraction() re-ranks hits by embedding similarity.
    search_response = es.search(index=index_name, q=user_question, size=3)
    top_3 = Extraction(response=search_response, question_embedding=question_embedding)

    results = {}
    # enumerate() instead of a manual counter named `id` (shadowed builtin).
    for rank, passage_info in enumerate(top_3):
        results[f"Passage {rank}:"] = passage_info["passage"]
        results[f"Metadata {rank}:"] = passage_info["metadata"]
        results[f"Score {rank}:"] = passage_info["score"]

    # Respond with a confirmation message.
    response = {
        'message': 'Question received successfully',
        'question': user_question,  # fixed misspelled key: was 'qustion'
        'results': results,
    }
    # Validate response data before returning it.
    if not validate_response(response):
        return jsonify({'error': 'Invalid response data'}), 500
    return jsonify(response)
""" | |
File Upload endpoint | |
""" | |
def upload_document():
    """Handle a POST file upload: ingest a passages CSV and index it.

    The CSV must have 'Passages', 'Metadata' and 'Embedding' columns,
    'Embedding' being a numpy-style text vector like "[1. 2. 3.]".
    NOTE(review): no @app.route decorator is visible in this chunk —
    confirm this handler is registered elsewhere.
    """
    # .get() instead of ['file']: a missing form field must yield the
    # 'No file uploaded' response, not a KeyError/500.
    uploaded_file = request.files.get('file')
    if not uploaded_file or not uploaded_file.filename:
        return jsonify({'message': 'No file uploaded'})

    app.logger.info(msg='file uploaded')
    # Ensure the target directory exists, and sanitize the client-supplied
    # filename (basename strips any path-traversal components).
    os.makedirs('uploads', exist_ok=True)
    file_path = os.path.join('uploads', os.path.basename(uploaded_file.filename))
    uploaded_file.save(file_path)

    df = pd.read_csv(file_path)
    # Parse the text-serialized vectors into float arrays.
    # np.fromstring on text is deprecated, so strip the brackets and
    # build the array from the whitespace-split tokens instead.
    df['Embedding'] = df['Embedding'].apply(
        lambda text: np.array(text.replace('\n', '')[1:-1].split(), dtype=float)
    )

    # Index each prepared document in Elasticsearch.
    documents = prepare_documents(df)
    index_name = "search-passagemetadataemb"  # index name created on Elasticsearch
    for doc_id, document in enumerate(documents):
        es.index(index=index_name, body=document, id=doc_id)
    return jsonify({'message': 'Document uploaded and indexed successfully'})