"""Streamlit UI for semantic course search.

The user's query is embedded through the Hugging Face inference API and
matched against course documents with a MongoDB ``$vectorSearch``
aggregation stage; matches are rendered as markdown cards.
"""
import os
import time

import pymongo
import requests
import streamlit as st
from dotenv import load_dotenv

load_dotenv()

# file_path = '/content/free_courses.json'
uri = os.getenv("URI")
hf_token = os.getenv("HF_TOKEN")

client = pymongo.MongoClient(uri)
embedding_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# Retry/timeout policy for the embedding endpoint.
EMBEDDING_MAX_ATTEMPTS = 5
EMBEDDING_RETRY_DELAY_SECONDS = 5
EMBEDDING_TIMEOUT_SECONDS = 30

try:
    db = client.av_courses
    collection = db.courses
    # Just trying to fetch the first document (you can adjust the query)
    sample_doc = collection.find_one()
except Exception as e:
    # Best-effort connectivity probe: the failure is reported on stdout and
    # the script keeps going, exactly as before.
    print(f"Error accessing the database or collection: {e}")


# def generate_embedding(text: str) -> list[float]:
#     response = requests.post(
#         embedding_url,
#         headers={"Authorization": f"Bearer {hf_token}"},
#         json={"inputs": text})
#     if response.status_code != 200:
#         raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
#     return response.json()


def generate_embedding(text: str) -> list[float]:
    """Request an embedding for *text* from the Hugging Face endpoint.

    The request is attempted up to ``EMBEDDING_MAX_ATTEMPTS`` times; only
    HTTP 503 responses are retried (presumably the model is still loading
    on the server side -- confirm against the API docs).

    Args:
        text: The text to embed.

    Returns:
        The decoded JSON body of the first 200 response (expected to be a
        flat list of floats for a single input string -- TODO confirm).

    Raises:
        ValueError: On any status other than 200/503, or when every
            attempt returned 503.
        requests.exceptions.RequestException: On connection problems or
            when a request exceeds ``EMBEDDING_TIMEOUT_SECONDS``.
    """
    for attempt in range(EMBEDDING_MAX_ATTEMPTS):
        response = requests.post(
            embedding_url,
            headers={"Authorization": f"Bearer {hf_token}"},
            json={"inputs": text},
            # Without a timeout a stalled endpoint would hang the app forever.
            timeout=EMBEDDING_TIMEOUT_SECONDS,
        )
        if response.status_code == 200:
            return response.json()
        if response.status_code != 503:
            raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
        # Wait before retrying, but do not sleep after the final attempt.
        if attempt < EMBEDDING_MAX_ATTEMPTS - 1:
            time.sleep(EMBEDDING_RETRY_DELAY_SECONDS)
    raise ValueError("Failed to generate embedding after multiple retries.")


# for doc in collection.find({'title':{"$exists": True}}).limit(75):
#     doc['course_embedding_hf'] = generate_embedding(doc['merged_summary'])
#     collection.replace_one({'_id': doc['_id']}, doc)
#     print(doc['_id'], end=" ")

# db.avcourses.create_index(
#     [("course_embedding_hf", pymongo.GEOSPHERE)],
#     name="CourseSemanticSearch"
# )


def getSearchResults(query: str, relevance_threshold: float) -> list[object]:
    """Return the courses whose vector-search score meets the threshold.

    Args:
        query: Free-text search query; it is embedded with
            ``generate_embedding``.
        relevance_threshold: Minimum ``vectorSearchScore`` a document must
            have to be returned.

    Returns:
        A list of matching course documents, sorted by descending
        ``score``. The list is empty when nothing meets the threshold.

    Raises:
        ValueError: Propagated from ``generate_embedding``.
    """
    query_embedding = generate_embedding(query)
    # NOTE(review): fixed pause kept from the original; presumably a crude
    # rate limit between the embedding call and the search -- confirm.
    time.sleep(1)
    pipeline = [
        {
            "$vectorSearch": {
                "queryVector": query_embedding,
                "path": "course_embedding_hf",
                "numCandidates": 100,
                "limit": 10,
                "index": "CourseSemanticSearch",
            }
        },
        {"$addFields": {"score": {"$meta": "vectorSearchScore"}}},
        {"$match": {"score": {"$gte": relevance_threshold}}},
        {"$sort": {"score": -1}},
        # "vector" is kept from the original projection; the embedding is
        # searched at "course_embedding_hf", so that field is excluded too
        # to avoid returning the full embedding array with every hit.
        {"$project": {"vector": 0, "course_embedding_hf": 0}},
    ]
    # aggregate() returns a cursor, which is always truthy; materialise it
    # so callers can test for "no results" and so the return type is a list.
    return list(collection.aggregate(pipeline))


# Streamlit UI
st.title("Semantic Search Interface")
st.subheader("Find the best courses for your query")

query = st.text_input("Enter your query here:")

# Slider for adjusting relevance threshold
slider_value = st.slider(
    "Adjust Relevance Threshold",
    min_value=0,
    max_value=100,
    value=60,
    step=1,
    help="Use this slider to adjust the minimum relevance score for results.",
)
# NOTE(review): the divisor 140 maps the 0-100 slider onto roughly
# 0-0.71; the reason for that particular scale is not visible here.
relevance_threshold = slider_value / 140

if st.button("Search"):
    # A whitespace-only query is treated the same as an empty one.
    if query.strip():
        # Generate query embedding
        with st.spinner("Fetching results..."):
            results = getSearchResults(query, relevance_threshold)
        if results:
            for course in results:
                print(course)
                # The fields below are indexed directly, so every returned
                # document is assumed to carry them.
                st.markdown(
                    f"### {course['title']}\n"
                    f"- **Description:** {course['description']}\n"
                    f"- **Duration:** {course['duration']}\n"
                    f"- **Ratings:** {course['ratings']}\n"
                    f"- **Difficulty:** {course['difficulty']}\n"
                )
                st.markdown(
                    f"[![Go to Course](https://img.shields.io/badge/Go%20to%20Course-blue)]({course['course_url']})",
                    unsafe_allow_html=True,
                )
                st.markdown("---")
        else:
            st.warning("No matches found! Try adjusting the relevance slider or using different keywords.")
        # except Exception as e:
        #     st.markdown(f"rate limit for searching has been completed try after few minutes\n",e)
    else:
        st.error("Please enter a query.")