Saarthak2002 committed
Commit 1ac3192 · verified · 1 Parent(s): d2331c3

Update app.py

Files changed (1)
  1. app.py +108 -105
app.py CHANGED
@@ -1,105 +1,108 @@
- import pandas as pd
- from sentence_transformers import SentenceTransformer, util
- from transformers import pipeline
- import torch
- import gradio as gr
-
- # Load the dataset
- df = pd.read_csv(r"C:\Users\Saarthak\Desktop\Saarthak_assignment\analytics_vidhya_data.csv", encoding='ISO-8859-1')
-
- # Load the pre-trained model for embeddings (using SentenceTransformers)
- model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
-
- # Combine title and description to create a full text for each course
- df['full_text'] = df.iloc[:,0] + " " + df.iloc[:,1] + " " + df['Instructor Name'] + " " + str(df['Rating']) + " " + df['Category']
-
- # Convert full course texts into embeddings
- course_embeddings = model.encode(df['full_text'].tolist(), convert_to_tensor=True)
-
- # Function to expand the query using paraphrasing
- def expand_query(query):
-     paraphraser = pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')
-     expanded_queries = paraphraser(query, num_return_sequences=3, max_length=50, do_sample=True)
-     return [q['generated_text'] for q in expanded_queries]
-
- # Function to search for the most relevant courses
- def search_courses(query, level_filter=None, category_filter=None, top_k=3):
-     # Step 1: Expand the query using paraphrasing
-     expanded_queries = expand_query(query)
-
-     # Step 2: Initialize an array to store all similarities
-     all_similarities = []
-
-     for expanded_query in expanded_queries:
-         # Convert each expanded query into an embedding
-         query_embedding = model.encode(expanded_query, convert_to_tensor=True)
-
-         # Compute cosine similarities between the query embedding and course embeddings
-         similarities = util.pytorch_cos_sim(query_embedding, course_embeddings)[0]
-
-         # Append to the list of all similarities
-         all_similarities.append(similarities)
-
-     # Step 3: Convert the list of tensors to a single tensor by taking the maximum similarity for each course
-     aggregated_similarities = torch.max(torch.stack(all_similarities), dim=0)[0]
-
-     # Step 4: Apply filters
-     filtered_df = df.copy()
-     if level_filter:
-         filtered_df = filtered_df[filtered_df['Level of Difficulty'] == level_filter]
-     if category_filter:
-         filtered_df = filtered_df[filtered_df['Category'] == category_filter]
-
-     if filtered_df.empty:
-         return "<p>No matching courses found.</p>"
-
-     # Recalculate similarities for the filtered data
-     filtered_similarities = aggregated_similarities[filtered_df.index]
-
-     # Step 5: Get top_k most similar courses
-     top_results = filtered_similarities.topk(k=min(top_k, len(filtered_similarities)))
-
-     # Prepare the output as clickable links
-     results = []
-     for idx in top_results.indices:
-         idx = int(idx)
-         course_title = filtered_df.iloc[idx]['Course Title']
-         course_description = filtered_df.iloc[idx,1]
-         course_url = filtered_df.iloc[idx,-1]
-
-         # Format the result as a clickable hyperlink using raw HTML
-         course_link = f'<a href="{course_url}" target="_blank">{course_title}</a>'
-         results.append(f"<strong>{course_link}</strong><br>{course_description}<br><br>")
-
-     # Combine all results into an HTML formatted list
-     return "<ol>" + "".join([f"<li>{result}</li>" for result in results]) + "</ol>"
-
- # Create Gradio UI
- def create_gradio_interface():
-     with gr.Blocks() as demo:
-         gr.Markdown("# 📚 Analytics Vidhya Free Courses")
-         gr.Markdown("Enter your query and use filters to narrow down the search.")
-
-         # Input elements
-         query = gr.Textbox(label="🔍 Search for a course", placeholder="Enter course topic or description")
-
-         # Filters (in a collapsible form)
-         with gr.Accordion("🔍 Filters", open=False):
-             level_filter = gr.Dropdown(choices=["Beginner", "Intermediate", "Advanced"], label="📚 Course Level", multiselect=False)
-             category_filter = gr.Dropdown(choices=["Data Science", "Machine Learning", "Deep Learning", "AI", "NLP"], label="📂 Category", multiselect=False)
-
-         # Search button
-         search_button = gr.Button("Search")
-
-         # Output HTML for displaying results
-         output = gr.HTML(label="Search Results")
-
-         # On button click, trigger the search function
-         search_button.click(fn=search_courses, inputs=[query, level_filter, category_filter], outputs=output)
-
-     return demo
-
- # Launch Gradio interface
- demo = create_gradio_interface()
- demo.launch(share=True, debug=True)
 
 
 
 
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import pipeline
+ import torch
+ import gradio as gr
+
+ # Use the relative path where the CSV is uploaded
+ csv_file_path = os.path.join(os.getcwd(), 'analytics_vidhya_data.csv')
+
+ # Load the dataset
+ df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')
+
+ # Load the pre-trained model for embeddings (using SentenceTransformers)
+ model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
+
+ # Combine title and description to create a full text for each course
+ df['full_text'] = df.iloc[:,0] + " " + df.iloc[:,1] + " " + df['Instructor Name'] + " " + str(df['Rating']) + " " + df['Category']
+
+ # Convert full course texts into embeddings
+ course_embeddings = model.encode(df['full_text'].tolist(), convert_to_tensor=True)
+
+ # Function to expand the query using paraphrasing
+ def expand_query(query):
+     paraphraser = pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')
+     expanded_queries = paraphraser(query, num_return_sequences=3, max_length=50, do_sample=True)
+     return [q['generated_text'] for q in expanded_queries]
+
+ # Function to search for the most relevant courses
+ def search_courses(query, level_filter=None, category_filter=None, top_k=3):
+     # Step 1: Expand the query using paraphrasing
+     expanded_queries = expand_query(query)
+
+     # Step 2: Initialize an array to store all similarities
+     all_similarities = []
+
+     for expanded_query in expanded_queries:
+         # Convert each expanded query into an embedding
+         query_embedding = model.encode(expanded_query, convert_to_tensor=True)
+
+         # Compute cosine similarities between the query embedding and course embeddings
+         similarities = util.pytorch_cos_sim(query_embedding, course_embeddings)[0]
+
+         # Append to the list of all similarities
+         all_similarities.append(similarities)
+
+     # Step 3: Convert the list of tensors to a single tensor by taking the maximum similarity for each course
+     aggregated_similarities = torch.max(torch.stack(all_similarities), dim=0)[0]
+
+     # Step 4: Apply filters
+     filtered_df = df.copy()
+     if level_filter:
+         filtered_df = filtered_df[filtered_df['Level of Difficulty'] == level_filter]
+     if category_filter:
+         filtered_df = filtered_df[filtered_df['Category'] == category_filter]
+
+     if filtered_df.empty:
+         return "<p>No matching courses found.</p>"
+
+     # Recalculate similarities for the filtered data
+     filtered_similarities = aggregated_similarities[filtered_df.index]
+
+     # Step 5: Get top_k most similar courses
+     top_results = filtered_similarities.topk(k=min(top_k, len(filtered_similarities)))
+
+     # Prepare the output as clickable links
+     results = []
+     for idx in top_results.indices:
+         idx = int(idx)
+         course_title = filtered_df.iloc[idx]['Course Title']
+         course_description = filtered_df.iloc[idx,1]
+         course_url = filtered_df.iloc[idx,-1]
+
+         # Format the result as a clickable hyperlink using raw HTML
+         course_link = f'<a href="{course_url}" target="_blank">{course_title}</a>'
+         results.append(f"<strong>{course_link}</strong><br>{course_description}<br><br>")
+
+     # Combine all results into an HTML formatted list
+     return "<ol>" + "".join([f"<li>{result}</li>" for result in results]) + "</ol>"
+
+ # Create Gradio UI
+ def create_gradio_interface():
+     with gr.Blocks() as demo:
+         gr.Markdown("# 📚 Analytics Vidhya Free Courses")
+         gr.Markdown("Enter your query and use filters to narrow down the search.")
+
+         # Input elements
+         query = gr.Textbox(label="🔍 Search for a course", placeholder="Enter course topic or description")
+
+         # Filters (in a collapsible form)
+         with gr.Accordion("🔍 Filters", open=False):
+             level_filter = gr.Dropdown(choices=["Beginner", "Intermediate", "Advanced"], label="📚 Course Level", multiselect=False)
+             category_filter = gr.Dropdown(choices=["Data Science", "Machine Learning", "Deep Learning", "AI", "NLP"], label="📂 Category", multiselect=False)
+
+         # Search button
+         search_button = gr.Button("Search")
+
+         # Output HTML for displaying results
+         output = gr.HTML(label="Search Results")
+
+         # On button click, trigger the search function
+         search_button.click(fn=search_courses, inputs=[query, level_filter, category_filter], outputs=output)
+
+     return demo
+
+ # Launch Gradio interface
+ demo = create_gradio_interface()
+ demo.launch(share=True, debug=True)
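
Note on the updated file: the new loading lines call os.path.join and os.getcwd, but the diff never adds an import for os, so app.py would raise a NameError at startup. Separately (present in both versions), str(df['Rating']) stringifies the entire Rating Series, so every row's full_text ends up containing the printed Series rather than that course's own rating. Below is a minimal sketch of the loading block with both points addressed; it assumes analytics_vidhya_data.csv sits next to app.py and that the column names match the diff above.

import os
import pandas as pd

# Resolve the CSV relative to the current working directory
# (assumption: analytics_vidhya_data.csv is uploaded alongside app.py).
csv_file_path = os.path.join(os.getcwd(), 'analytics_vidhya_data.csv')
df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# Build the searchable text per row; .astype(str) converts each rating
# individually instead of pasting the printed Series into every row.
df['full_text'] = (
    df.iloc[:, 0] + " "
    + df.iloc[:, 1] + " "
    + df['Instructor Name'] + " "
    + df['Rating'].astype(str) + " "
    + df['Category']
)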