# Spaces: Sleeping  (page-capture residue, not part of the program)
import streamlit as st | |
import pymongo | |
import requests | |
import time | |
from dotenv import load_dotenv | |
import os | |
# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv() lookups below can see them.
load_dotenv()

uri = os.getenv("URI")  # connection string handed to MongoClient
hf_token = os.getenv("HF_TOKEN")  # sent as the Bearer token to embedding_url
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

client = pymongo.MongoClient(uri)
# Attribute access on the client/database only builds handles; the first
# real round-trip to the server is the find_one() probe below.
db = client.av_courses
collection = db.courses

# Connectivity probe: try to read a single document so that a bad URI or an
# unreachable server is reported at start-up rather than on the first search.
# sample_doc stays None when the probe fails (or the collection is empty).
sample_doc = None
try:
    sample_doc = collection.find_one()
except Exception as e:
    print(f"Error accessing the database or collection: {e}")
    # Also surface the problem in the page; stdout is not visible to the user.
    st.error("Could not reach the course database. Searches will fail until the connection is restored.")
# Earlier single-attempt version of generate_embedding (no retry on HTTP 503),
# kept for reference only; the active definition follows.
# def generate_embedding(text: str) -> list[float]:
#     response = requests.post(
#         embedding_url,
#         headers={"Authorization": f"Bearer {hf_token}"},
#         json={"inputs": text})
#     if response.status_code != 200:
#         raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
#     return response.json()
def generate_embedding(text: str) -> list[float]:
    """Return the embedding of *text* from the hosted feature-extraction endpoint.

    POSTs ``{"inputs": text}`` to ``embedding_url`` with ``hf_token`` as the
    Bearer token and returns the decoded JSON body of the first 200 response.
    A 503 response is retried, up to 5 attempts in total with a 5 second
    pause between attempts; any other status fails immediately.

    Args:
        text: The text to embed.

    Returns:
        The decoded JSON payload of the successful response.

    Raises:
        ValueError: On a status other than 200/503, or when every attempt
            returned 503.
        requests.RequestException: On a network error or when a request
            exceeds the timeout.
    """
    max_attempts = 5
    retry_delay_seconds = 5
    # Without a timeout a stalled connection would block the app indefinitely.
    request_timeout_seconds = 30

    for attempt in range(1, max_attempts + 1):
        response = requests.post(
            embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text},
            timeout=request_timeout_seconds,
        )
        if response.status_code == 200:
            return response.json()
        if response.status_code != 503:
            raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
        # 503: wait before the next attempt, but do not sleep after the last
        # one since we are about to give up anyway.
        if attempt < max_attempts:
            time.sleep(retry_delay_seconds)

    raise ValueError("Failed to generate embedding after multiple retries.")
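# One-off backfill, kept for reference only: computes an embedding from each
# document's 'merged_summary' and stores it under 'course_embedding_hf'.
# for doc in collection.find({'title':{"$exists": True}}).limit(75):
#     doc['course_embedding_hf'] = generate_embedding(doc['merged_summary'])
#     collection.replace_one({'_id': doc['_id']}, doc)
#     print(doc['_id'], end=" ")
# NOTE(review): the snippet below targets db.avcourses while the app reads
# db.courses, and it requests a GEOSPHERE index under the same name that the
# $vectorSearch stage uses ("CourseSemanticSearch"). Presumably the real index
# was created separately as an Atlas Vector Search index - confirm before
# re-running this.
# db.avcourses.create_index(
#     [("course_embedding_hf", pymongo.GEOSPHERE)],
#     name="CourseSemanticSearch"
# )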
def getSearchResults(query: str, relevance_threshold: float) -> list[object]:
    """Run a vector search for *query* and return the matching course documents.

    The query is embedded with ``generate_embedding`` and searched against
    the ``course_embedding_hf`` field through the ``CourseSemanticSearch``
    index. Up to 10 hits are requested; those scoring below
    *relevance_threshold* are dropped and the rest are returned best-first
    with the search score attached as ``score``.

    Args:
        query: Free-text search query.
        relevance_threshold: Minimum ``vectorSearchScore`` a hit must reach.

    Returns:
        A list of result documents (possibly empty). The cursor is
        materialized so the result matches the annotation and an empty
        result is falsy for callers.

    Raises:
        ValueError: Propagated from ``generate_embedding``.
    """
    query_embedding = generate_embedding(query)
    # NOTE(review): fixed 1 s pause between the embedding call and the
    # database query - presumably a crude throttle; confirm it is still needed.
    time.sleep(1)

    pipeline = [
        {
            "$vectorSearch": {
                "queryVector": query_embedding,
                "path": "course_embedding_hf",
                "numCandidates": 100,
                "limit": 10,
                "index": "CourseSemanticSearch",
            }
        },
        {"$addFields": {"score": {"$meta": "vectorSearchScore"}}},
        {"$match": {"score": {"$gte": relevance_threshold}}},
        {"$sort": {"score": -1}},
        # Strip the embedding from the returned documents. It is stored under
        # "course_embedding_hf" (the path searched above); "vector" is kept in
        # the exclusion list for backward compatibility.
        {"$project": {"vector": 0, "course_embedding_hf": 0}},
    ]
    return list(collection.aggregate(pipeline))
# Streamlit UI: a query box, a relevance slider and a Search button that
# renders each matching course as a markdown card.
st.title("Semantic Search Interface")
st.subheader("Find the best courses for your query")
query = st.text_input("Enter your query here:")

# Slider for adjusting relevance threshold
slider_value = st.slider(
    "Adjust Relevance Threshold",
    min_value=0,
    max_value=100,
    value=60,
    step=1,
    help="Use this slider to adjust the minimum relevance score for results."
)
# NOTE(review): dividing by 140 maps the 0-100 slider onto roughly 0-0.71;
# the reason for that divisor is not visible here - confirm the calibration.
relevance_threshold = slider_value / 140

if st.button("Search"):
    if not query.strip():
        st.error("Please enter a query.")
    else:
        try:
            with st.spinner("Fetching results..."):
                # Materialize the result: a database cursor is truthy even
                # when it yields nothing, which would hide the "no matches"
                # message below.
                results = list(getSearchResults(query, relevance_threshold))
        except (ValueError, requests.RequestException, pymongo.errors.PyMongoError) as e:
            # Embedding-service or database failure: log the detail on the
            # server and show a short message instead of a raw traceback.
            print(f"Search failed: {e}")
            st.error("Search is temporarily unavailable. Please try again in a few minutes.")
        else:
            if not results:
                st.warning("No matches found! Try adjusting the relevance slider or using different keywords.")
            for course in results:
                # Tolerate documents that lack one of the display fields.
                title = course.get("title", "Untitled course")
                description = course.get("description", "N/A")
                duration = course.get("duration", "N/A")
                ratings = course.get("ratings", "N/A")
                difficulty = course.get("difficulty", "N/A")
                st.markdown(
                    f"""
                    ### {title}
                    - **Description:** {description}
                    - **Duration:** {duration}
                    - **Ratings:** {ratings}
                    - **Difficulty:** {difficulty}
                    """
                )
                course_url = course.get("course_url")
                if course_url:
                    # The badge is plain markdown, so raw HTML rendering is
                    # not needed for it.
                    st.markdown(f"[![Go to Course](https://img.shields.io/badge/Go%20to%20Course-blue)]({course_url})")
                st.markdown("---")