# SearchCourses / app.py
# kartiksrma's picture
# Update app.py
# 01ec9c3 verified
import streamlit as st
import pymongo
import requests
import time
from dotenv import load_dotenv
import os
# Load settings from a local .env file (if present) into the environment
# before any of them are read below.
load_dotenv()
# file_path = '/content/free_courses.json'

# Connection string for MongoDB and bearer token for the Hugging Face
# inference endpoint; either is None when the variable is not set.
uri = os.environ.get("URI")
hf_token = os.environ.get("HF_TOKEN")
client = pymongo.MongoClient(uri)

# Feature-extraction endpoint used to embed search queries.
embedding_url = (
    "https://api-inference.huggingface.co/pipeline/feature-extraction/"
    "sentence-transformers/all-MiniLM-L6-v2"
)

try:
    db = client["av_courses"]
    collection = db["courses"]
    # Fetch one document as a connectivity probe; the query itself is
    # arbitrary and the result is not used further in this file.
    sample_doc = collection.find_one()
except Exception as exc:
    # Best-effort: the failure is only reported on stdout and the script
    # keeps running.
    print(f"Error accessing the database or collection: {exc}")
# def generate_embedding(text: str) -> list[float]:
# response = requests.post(
# embedding_url,
# headers={"Authorization": f"Bearer {hf_token}"},
# json={"inputs": text})
# if response.status_code != 200:
# raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
# return response.json()
def generate_embedding(text: str) -> list[float]:
    """Return the embedding of *text* from the Hugging Face inference endpoint.

    POSTs ``{"inputs": text}`` to ``embedding_url`` using ``hf_token`` as a
    bearer token and returns the decoded JSON body of the first HTTP 200
    response. An HTTP 503 is retried, up to 5 attempts in total with a 5 s
    pause between attempts; any other status fails immediately.

    Args:
        text: The text to embed.

    Returns:
        The decoded JSON response body (expected to be the embedding vector).

    Raises:
        ValueError: If the endpoint answers with a status other than 200/503,
            or if every attempt was answered with 503.
        requests.RequestException: On network failure, including when a
            request exceeds the timeout.
    """
    max_attempts = 5
    retry_delay_seconds = 5
    # Without a timeout, requests waits forever on a stalled connection and
    # the whole app would hang on one search.
    request_timeout_seconds = 30

    for attempt in range(1, max_attempts + 1):
        response = requests.post(
            embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text},
            timeout=request_timeout_seconds,
        )
        if response.status_code == 200:
            return response.json()
        if response.status_code != 503:
            raise ValueError(
                f"Request failed with status code {response.status_code}: {response.text}"
            )
        # 503: service temporarily unavailable. Wait before the next try,
        # but do not sleep after the final attempt since we are about to
        # give up anyway.
        if attempt < max_attempts:
            time.sleep(retry_delay_seconds)

    raise ValueError("Failed to generate embedding after multiple retries.")
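# NOTE(review): disabled one-off setup code, kept for reference.
# The loop embeds each document's 'merged_summary' and stores the result in
# 'course_embedding_hf'. The index call below targets db.avcourses (the live
# code reads db.courses) and builds a GEOSPHERE index, whereas the
# $vectorSearch stage in getSearchResults requires an Atlas Vector Search
# index named "CourseSemanticSearch" -- presumably that index was created in
# Atlas directly; confirm before re-enabling any of this.
# for doc in collection.find({'title':{"$exists": True}}).limit(75):
#     doc['course_embedding_hf'] = generate_embedding(doc['merged_summary'])
#     collection.replace_one({'_id': doc['_id']}, doc)
#     print(doc['_id'], end=" ")
# db.avcourses.create_index(
#     [("course_embedding_hf", pymongo.GEOSPHERE)],
#     name="CourseSemanticSearch"
# )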
def getSearchResults(query: str, relevance_threshold: float) -> list[object]:
    """Run a vector search over the courses collection for *query*.

    The query is embedded with ``generate_embedding`` and matched against the
    ``course_embedding_hf`` field through the ``CourseSemanticSearch`` index.
    At most 10 documents (out of 100 candidates) are considered; those whose
    ``vectorSearchScore`` is below *relevance_threshold* are dropped, and the
    remainder is ordered by descending score.

    Args:
        query: Free-text search query.
        relevance_threshold: Minimum ``score`` a document must have to be
            returned.

    Returns:
        A list of matching documents, each with an added ``score`` field.
        The list is empty when nothing meets the threshold, so callers can
        test it for truthiness (a raw aggregation cursor cannot be tested
        that way).

    Raises:
        ValueError: Propagated from ``generate_embedding``.
    """
    query_embedding = generate_embedding(query)
    # NOTE(review): fixed 1 s pause kept from the original; presumably it
    # spaces out calls for rate limiting -- confirm whether it is still needed.
    time.sleep(1)

    pipeline = [
        {
            "$vectorSearch": {
                "queryVector": query_embedding,
                "path": "course_embedding_hf",
                "numCandidates": 100,
                "limit": 10,
                "index": "CourseSemanticSearch",
            }
        },
        # Expose the similarity score so it can be filtered and sorted on.
        {"$addFields": {"score": {"$meta": "vectorSearchScore"}}},
        {"$match": {"score": {"$gte": relevance_threshold}}},
        {"$sort": {"score": -1}},
        # NOTE(review): this hides a field called "vector", but the embedding
        # is stored under "course_embedding_hf", which is therefore still
        # returned in every document -- confirm which field was intended.
        {"$project": {"vector": 0}},
    ]
    # Materialise the cursor: the declared return type is a list, and an
    # exhausted-or-empty cursor object would otherwise still be truthy.
    return list(collection.aggregate(pipeline))
# Streamlit UI: a query box, a relevance slider and a search button that
# renders each matching course as a markdown card.
st.title("Semantic Search Interface")
st.subheader("Find the best courses for your query")
query = st.text_input("Enter your query here:")

# Slider for adjusting relevance threshold
slider_value = st.slider(
    "Adjust Relevance Threshold",
    min_value=0,
    max_value=100,
    value=60,
    step=1,
    help="Use this slider to adjust the minimum relevance score for results."
)
# NOTE(review): dividing by 140 maps the 0-100 slider onto roughly 0-0.71;
# why 140 rather than 100 is not evident from this file -- confirm.
relevance_threshold = slider_value / 140

if st.button("Search"):
    if not query.strip():
        # Empty or whitespace-only input: nothing meaningful to embed.
        st.error("Please enter a query.")
    else:
        try:
            with st.spinner("Fetching results..."):
                # Materialise the results so that the emptiness check below
                # is meaningful even if a lazy cursor is returned.
                results = list(getSearchResults(query, relevance_threshold))
        except (
            ValueError,
            requests.RequestException,
            pymongo.errors.PyMongoError,
        ) as exc:
            # Embedding API or database failure: show a readable message
            # instead of letting the page crash with a traceback.
            st.error(f"Search failed: {exc}")
        else:
            if results:
                for course in results:
                    st.markdown(
                        f"\n### {course['title']}\n"
                        f"- **Description:** {course['description']}\n"
                        f"- **Duration:** {course['duration']}\n"
                        f"- **Ratings:** {course['ratings']}\n"
                        f"- **Difficulty:** {course['difficulty']}\n"
                    )
                    st.markdown(
                        f"[![Go to Course](https://img.shields.io/badge/Go%20to%20Course-blue)]({course['course_url']})",
                        unsafe_allow_html=True,
                    )
                    st.markdown("---")
            else:
                st.warning("No matches found! Try adjusting the relevance slider or using different keywords.")