mlkorra committed on
Commit
dcb2841
·
verified ·
1 Parent(s): 357c0c8
Files changed (3) hide show
  1. pages/Classifier.py +288 -0
  2. pages/Home.py +54 -0
  3. pages/Project_Wiki.py +274 -0
pages/Classifier.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from utils.util_classifier import TextClassificationPipeline
3
+ import time
4
+ import requests
5
+ import io
6
+ import pdfplumber
7
+ from urllib.parse import urlparse
8
+ import plotly.graph_objects as go
9
+ import plotly.express as px
10
+
11
def validate_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    Only the shape of the URL is checked; nothing is fetched. Malformed or
    non-string input yields False instead of raising.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed netlocs (e.g. an unclosed
        # IPv6 bracket) and TypeError/AttributeError for non-string input.
        return False
    return bool(parsed.scheme and parsed.netloc)
17
+
18
def download_pdf(url, timeout=30):
    """Fetch *url* and return the response body as an in-memory PDF stream.

    Args:
        url: Address expected to serve a PDF document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        An ``io.BytesIO`` holding the response body, or ``None`` when the
        request fails or the response is not declared as ``application/pdf``.
        Failures are reported to the user through ``st.error``.
    """
    # Browser-like headers. NOTE(review): presumably needed because some hosts
    # reject the default client; confirm the hard-coded Referer is still wanted.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/pdf,*/*',
        'Referer': 'https://www.inter-lux.com/'
    }

    try:
        # requests waits forever without an explicit timeout, which would
        # leave the page stuck on its spinner if the server stalls.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Verify content type is PDF
        content_type = response.headers.get('content-type', '')
        if 'application/pdf' not in content_type.lower():
            raise ValueError(f"URL does not point to a PDF file. Content-Type: {content_type}")

        return io.BytesIO(response.content)
    except Exception as e:
        # UI boundary: surface any failure as a message instead of a traceback.
        st.error(f"Download error: {str(e)}")
        return None
38
+
39
def extract_text(pdf_file):
    """Collect the extractable text of every page in a PDF.

    Args:
        pdf_file: Seekable binary file-like object containing the PDF.

    Returns:
        The page texts joined by newlines and stripped of surrounding
        whitespace, or ``None`` when the PDF cannot be read or holds no
        extractable text. Failures are reported through ``st.error``.
    """
    try:
        # Rewind in case the stream was already read.
        pdf_file.seek(0)

        with pdfplumber.open(pdf_file) as document:
            page_texts = [page.extract_text() for page in document.pages]

        # Pages without extractable text come back empty and are skipped.
        combined = "\n".join(chunk for chunk in page_texts if chunk).strip()
        if not combined:
            raise ValueError("No text could be extracted from the PDF")

        return combined
    except Exception as err:
        st.error(f"Text extraction error: {str(err)}")
        return None
58
+
59
@st.cache_resource
def _load_classifier(method):
    """Build the classification pipeline for *method* once and reuse it.

    Streamlit re-executes the page on every widget interaction; caching the
    pipeline as a resource avoids constructing it again on each rerun.
    """
    return TextClassificationPipeline(method=method)


def _create_gauge_chart(confidence):
    """Return a gauge figure showing *confidence* scaled to a 0-100 axis."""
    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=confidence * 100,
        domain={'x': [0, 1], 'y': [0, 1]},
        gauge={
            'axis': {'range': [None, 100], 'tickwidth': 1, 'tickcolor': "darkblue"},
            'bar': {'color': "darkblue"},
            'bgcolor': "white",
            'borderwidth': 2,
            'bordercolor': "gray",
            'steps': [
                {'range': [0, 50], 'color': '#FF9999'},
                {'range': [50, 75], 'color': '#FFCC99'},
                {'range': [75, 100], 'color': '#99FF99'},
            ],
        },
        title={'text': "Confidence Score"},
    ))

    fig.update_layout(
        height=300,
        margin=dict(l=10, r=10, t=50, b=10),
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': "darkblue", 'family': "Arial"},
    )
    return fig


def _create_probability_chart(probabilities):
    """Return a horizontal bar chart of a label -> probability mapping."""
    labels = list(probabilities)
    percentages = [value * 100 for value in probabilities.values()]

    # Start two shades into the palette (the palest ones are hard to see) and
    # wrap around so that more labels than shades cannot raise IndexError.
    palette = px.colors.sequential.Blues
    bar_colors = [palette[(i + 2) % len(palette)] for i in range(len(labels))]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=labels,
        x=percentages,
        orientation='h',
        marker=dict(
            color=bar_colors,
            line=dict(color='rgba(0,0,0,0.8)', width=2),
        ),
        text=[f'{pct:.1f}%' for pct in percentages],
        textposition='auto',
    ))

    fig.update_layout(
        title=dict(
            text='Probability Distribution',
            y=0.95,
            x=0.5,
            xanchor='center',
            yanchor='top',
            font=dict(size=20, color='darkblue'),
        ),
        xaxis_title="Probability (%)",
        yaxis_title="Categories",
        height=400,
        margin=dict(l=20, r=20, t=70, b=20),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(family="Arial", size=14),
        showlegend=False,
    )

    fig.update_xaxes(
        range=[0, 100],
        gridcolor='rgba(0,0,0,0.1)',
        zerolinecolor='rgba(0,0,0,0.2)',
    )
    fig.update_yaxes(
        gridcolor='rgba(0,0,0,0.1)',
        zerolinecolor='rgba(0,0,0,0.2)',
    )
    return fig


def _display_results(result):
    """Render one classification result.

    Reads the ``predicted_label``, ``confidence``, ``probabilities``,
    ``model_type`` and ``text`` entries of *result*.
    """
    col1, col2 = st.columns([1, 2])

    with col1:
        # Predicted Category Card
        st.markdown("""
        <div style='
            background-color: white;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            text-align: center;
            margin-bottom: 20px;
        '>
            <h4 style='color: #1f77b4; margin-bottom: 10px;'>Predicted Category</h4>
            <p style='
                font-size: 24px;
                font-weight: bold;
                color: #2c3e50;
                margin: 0;
                padding: 10px;
                background-color: #f8f9fa;
                border-radius: 5px;
            '>{}</p>
        </div>
        """.format(result['predicted_label']), unsafe_allow_html=True)

        # Confidence Gauge
        st.plotly_chart(_create_gauge_chart(result['confidence']), use_container_width=True)

    with col2:
        # Probability Distribution
        st.plotly_chart(_create_probability_chart(result['probabilities']), use_container_width=True)

    with st.expander("📊 Classification Details"):
        st.markdown(f"""
        - **Model Type**: {result['model_type'].title()}
        - **Document Length**: {len(result['text'])} characters
        """)


def _classify_and_display(classifier, text):
    """Classify *text* and render the outcome; shared by both input tabs."""
    with st.spinner("Classifying document..."):
        result = classifier.predict(text, return_probability=True)
        # predict may hand back a list of results; only the first is shown.
        if isinstance(result, list):
            result = result[0]

    st.markdown("### 📊 Classification Results")
    _display_results(result)


def _run_url_tab(classifier):
    """Render the URL tab: download a PDF, extract its text and classify it."""
    url = st.text_input("Enter PDF URL")
    clicked = st.button("Classify Document", key="url_classify")
    if not (clicked and url):
        return

    if not validate_url(url):
        st.error("Please enter a valid URL")
        return

    with st.container():
        # Step 1: Downloading
        with st.spinner("Downloading PDF..."):
            pdf_file = download_pdf(url)
        if pdf_file is None:
            return
        st.success("PDF downloaded successfully!")

        # Step 2: Extracting Text
        with st.spinner("Extracting text from PDF..."):
            text = extract_text(pdf_file)
        if not text or not text.strip():
            return
        st.success("Text extracted successfully!")

        with st.expander("View Extracted Text"):
            st.text(text[:500] + "..." if len(text) > 500 else text)

        # Step 3: Classification
        _classify_and_display(classifier, text)


def _run_upload_tab(classifier):
    """Render the upload tab: extract text from an uploaded PDF and classify it."""
    uploaded_file = st.file_uploader("Upload PDF file", type="pdf")
    clicked = st.button("Classify Document", key="file_classify")
    if not (clicked and uploaded_file):
        return

    with st.spinner("Processing uploaded PDF..."):
        text = extract_text(uploaded_file)
    if text is None:
        return

    _classify_and_display(classifier, text)


def main():
    """Render the document classifier page with URL and file-upload inputs."""
    st.title("🎯 Document Classifier")

    method = "bertbased"
    classifier = _load_classifier(method)

    url_tab, upload_tab = st.tabs(["🔗 URL Input", "📁 File Upload"])

    # Each tab body is its own function so that an early exit in one tab
    # cannot prevent the other tab's widgets from being rendered.
    with url_tab:
        _run_url_tab(classifier)

    with upload_tab:
        _run_upload_tab(classifier)


main()
pages/Home.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+
4
# Landing page: title, hero banner, feature cards and a button that switches
# to the classifier page.
st.title("🏗️ ConstructAI - Smart Document Classifier")

# Hero section
st.markdown("""
<div class="hero-section">
    <h4>Automate your construction document classification with AI-powered accuracy</h4>
</div>
""", unsafe_allow_html=True)

# Key Features
st.markdown("### 🚀 Key Features")

# (heading, blurb) for each feature card, in display order.
_FEATURE_CARDS = (
    ("🎯 Precise Classification", "Advanced AI models for accurate document categorization"),
    ("⚡ Instant Results", "Get classifications in seconds, not hours"),
    ("📊 Detailed Analytics", "Confidence scores and detailed predictions"),
)

# One column per card; the markup is identical apart from the two texts.
for column, (heading, blurb) in zip(st.columns(len(_FEATURE_CARDS)), _FEATURE_CARDS):
    with column:
        st.markdown(f"""
<div class="feature-card">
    <h4>{heading}</h4>
    <p>{blurb}</p>
</div>
""", unsafe_allow_html=True)

st.divider()

# Call to Action
st.markdown("""
<div class="cta-section">
    <h3>Ready to Get Started?</h3>
    <p>Try our classifier now and experience the power of AI in construction document management.</p>
</div>
""", unsafe_allow_html=True)

if st.button("Try Classifier Now →", key="cta_button"):
    st.switch_page("pages/Classifier.py")
pages/Project_Wiki.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+
5
def main():
    """Render the project documentation page.

    Shows six question/answer cards (timeline, approach, models, limitations,
    test performance, metric choice) followed by a grouped bar chart comparing
    the baseline and BERT scores.
    """
    st.title("📚 Project Documentation")

    # Custom CSS for better styling
    st.markdown("""
    <style>
    .question-card {
        background-color: #f8f9fa;
        padding: 20px;
        border-radius: 10px;
        border-left: 5px solid #1f77b4;
        margin: 20px 0;
    }
    .question {
        color: #1f77b4;
        font-size: 1.2em;
        font-weight: bold;
        margin-bottom: 15px;
    }
    .answer {
        color: #2c3e50;
        line-height: 1.6;
    }
    </style>
    """, unsafe_allow_html=True)

    # Q1: Development Timeline
    st.markdown("""
    <div class="question-card">
        <div class="question">⏱️ Q1: How long did it take to solve the problem?</div>
        <div class="answer">
            The solution was developed in approximately <b>5 hours</b> (excluding data collection and model training phases).
        </div>
    </div>
    """, unsafe_allow_html=True)

    # Q2: Solution Explanation
    st.markdown("""
    <div class="question-card">
        <div class="question">🔍 Q2: Can you explain your solution approach?</div>
        <div class="answer">
            The solution implements a multi-stage document classification pipeline:
            <br><br>
            <b>1. Direct URL Text Approach:</b>
            <ul>
                <li>Initially considered direct URL text extraction</li>
                <li>Found limitations in accuracy and reliability</li>
            </ul>
            <br>
            <b>2. Baseline Approach (ML Model):</b>
            <ul>
                <li>Implemented TF-IDF vectorization</li>
                <li>Used Logistic Regression for classification</li>
                <li>Provided quick and efficient results</li>
            </ul>
            <br>
            <b>3. (DL Model):</b>
            <ul>
                <li>Utilized BERT-based model architecture</li>
                <li>Fine-tuned on construction document dataset</li>
                <li>Achieved superior accuracy and context understanding</li>
            </ul>
        </div>
    </div>
    """, unsafe_allow_html=True)

    # Q3: Model Selection
    st.markdown("""
    <div class="question-card">
        <div class="question">🤖 Q3: Which models did you use and why?</div>
        <div class="answer">
            Implemented baseline using TF-IDF and Logistic Regression and then used BERT-based model:
            <br><br>
            <b>Baseline Model:</b>
            <ul>
                <li>TF-IDF + Logistic Regression</li>
                <li>Quick inference time</li>
                <li>Resource-efficient</li>
            </ul>
            <br>
            <b>BERT Model:</b>
            <ul>
                <li>Fine-tuned on 1800 samples text</li>
                <li>Better context understanding</li>
                <li>Better handling of complex documents</li>
            </ul>
        </div>
    </div>
    """, unsafe_allow_html=True)

    # Q4: Limitations and Improvements
    st.markdown("""
    <div class="question-card">
        <div class="question">⚠️ Q4: What are the current limitations and potential improvements?</div>
        <div class="answer">
            <b>Current Implementation & Limitations:</b>
            <ul>
                <li>~25% of dataset URLs were inaccessible</li>
                <li>Used Threadpooling for parallel downloading of train and test documents</li>
            </ul>
            <br>
            <b>Proposed Improvements:</b>
            <ul>
                <li>Use latest LLMs like GPT-4o, Claude 3.5 Sonnet etc with few shot prompting to speed up the development process</li>
                <li>Optimize inference pipeline for faster processing using distilled models like DistilBERT, or the last BERT based model - ModernBERT to compare the performance</li>
                <li>Add support for more document formats</li>
            </ul>
        </div>
    </div>
    """, unsafe_allow_html=True)

    # Q5: Model Performance (per-class report for the BERT model)
    st.markdown("""
    <div class="question-card">
        <div class="question">📊 Q5: What is the model's performance on test data?</div>
        <div class="answer">
            <b>BERT Model Performance:</b>
            <br><br>
            <div style="overflow-x: auto;">
                <table style="
                    width: 100%;
                    border-collapse: collapse;
                    margin: 20px 0;
                    font-size: 0.9em;
                    font-family: sans-serif;
                    box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
                    border-radius: 5px;
                ">
                    <thead>
                        <tr style="
                            background-color: #1f77b4;
                            color: white;
                            text-align: left;
                        ">
                            <th style="padding: 12px 15px;">Category</th>
                            <th style="padding: 12px 15px;">Precision</th>
                            <th style="padding: 12px 15px;">Recall</th>
                            <th style="padding: 12px 15px;">F1-Score</th>
                            <th style="padding: 12px 15px;">Support</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr style="border-bottom: 1px solid #dddddd;">
                            <td style="padding: 12px 15px;"><b>Cable</b></td>
                            <td style="padding: 12px 15px;">1.00</td>
                            <td style="padding: 12px 15px;">1.00</td>
                            <td style="padding: 12px 15px;">1.00</td>
                            <td style="padding: 12px 15px;">92</td>
                        </tr>
                        <tr style="border-bottom: 1px solid #dddddd; background-color: #f3f3f3;">
                            <td style="padding: 12px 15px;"><b>Fuses</b></td>
                            <td style="padding: 12px 15px;">0.95</td>
                            <td style="padding: 12px 15px;">1.00</td>
                            <td style="padding: 12px 15px;">0.98</td>
                            <td style="padding: 12px 15px;">42</td>
                        </tr>
                        <tr style="border-bottom: 1px solid #dddddd;">
                            <td style="padding: 12px 15px;"><b>Lighting</b></td>
                            <td style="padding: 12px 15px;">0.94</td>
                            <td style="padding: 12px 15px;">1.00</td>
                            <td style="padding: 12px 15px;">0.97</td>
                            <td style="padding: 12px 15px;">74</td>
                        </tr>
                        <tr style="border-bottom: 1px solid #dddddd; background-color: #f3f3f3;">
                            <td style="padding: 12px 15px;"><b>Others</b></td>
                            <td style="padding: 12px 15px;">1.00</td>
                            <td style="padding: 12px 15px;">0.92</td>
                            <td style="padding: 12px 15px;">0.96</td>
                            <td style="padding: 12px 15px;">83</td>
                        </tr>
                    </tbody>
                    <tfoot>
                        <tr style="background-color: #f8f9fa; font-weight: bold; border-top: 2px solid #dddddd;">
                            <td style="padding: 12px 15px;">Accuracy</td>
                            <td style="padding: 12px 15px;" colspan="3">0.98</td>
                            <td style="padding: 12px 15px;">291</td>
                        </tr>
                        <tr style="background-color: #f8f9fa; color: #666;">
                            <td style="padding: 12px 15px;">Macro Avg</td>
                            <td style="padding: 12px 15px;">0.97</td>
                            <td style="padding: 12px 15px;">0.98</td>
                            <td style="padding: 12px 15px;">0.98</td>
                            <td style="padding: 12px 15px;">291</td>
                        </tr>
                        <tr style="background-color: #f8f9fa; color: #666;">
                            <td style="padding: 12px 15px;">Weighted Avg</td>
                            <td style="padding: 12px 15px;">0.98</td>
                            <td style="padding: 12px 15px;">0.98</td>
                            <td style="padding: 12px 15px;">0.98</td>
                            <td style="padding: 12px 15px;">291</td>
                        </tr>
                    </tfoot>
                </table>
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)

    # Highlights drawn from the table above
    st.markdown("""
    <div style='
        background-color: #f8f9fa;
        padding: 20px;
        border-radius: 10px;
        border-left: 5px solid #1f77b4;
        margin: 20px 0;
    '>
        ✨ Perfect performance (1.00) for Cable category<br>
        📈 High recall (1.00) across most categories<br>
        🎯 Overall accuracy of 98%<br>
        ⚖️ Balanced performance across all metrics
    </div>
    """, unsafe_allow_html=True)

    # Q6: Metric Selection
    st.markdown("""
    <div class="question-card">
        <div class="question">📈 Q6: Why did you choose these particular metrics?</div>
        <div class="answer">
            Our metric selection was driven by the dataset characteristics:
            <br><br>
            <b>Key Considerations:</b>
            <ul>
                <li>Dataset has mild class imbalance (Imbalance Ratio: 2.36)</li>
                <li>Need for balanced evaluation across all classes</li>
            </ul>
            <br>
            <b>Selected Metrics:</b>
            <ul>
                <li><b>Precision:</b> Critical for minimizing false positives</li>
                <li><b>Recall:</b> Important for catching all instances of each class</li>
                <li><b>F1-Score:</b> Provides balanced evaluation of both metrics</li>
                <li><b>Weighted Average:</b> Accounts for class imbalance</li>
            </ul>
        </div>
    </div>
    """, unsafe_allow_html=True)

    # Performance Visualization
    st.markdown("### 📊 Model Performance Comparison")
    metrics = {
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
        'Baseline': [0.85, 0.83, 0.84, 0.83],
        'BERT': [0.98, 0.97, 0.98, 0.98]
    }

    df = pd.DataFrame(metrics)

    # Wide-form frame: one bar group per metric, one bar per model column.
    fig = px.bar(
        df,
        x='Metric',
        y=['Baseline', 'BERT'],
        barmode='group',
        title='Model Performance Comparison',
        color_discrete_sequence=['#2ecc71', '#3498db'],
        template='plotly_white'
    )

    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        legend_title_text='Model Type',
        xaxis_title="Evaluation Metric",
        yaxis_title="Score",
        bargap=0.2,
        height=500
    )

    st.plotly_chart(fig, use_container_width=True)


main()