"""Streamlit page that renders the project documentation.

The page is a sequence of styled question/answer cards (Q1-Q6) followed by a
grouped bar chart comparing the baseline and BERT models. All content is
static: the text, the metrics table and the chart values are hard-coded here.
"""

import streamlit as st
import pandas as pd
import plotly.express as px


def _render_html(html: str) -> None:
    """Render a raw HTML fragment through ``st.markdown``.

    Args:
        html: Static HTML/CSS markup. It is passed with
            ``unsafe_allow_html=True``, so it must never contain untrusted
            (user-supplied) content.
    """
    st.markdown(html, unsafe_allow_html=True)


def _inject_styles() -> None:
    """Emit the CSS classes used by the question/answer cards."""
    _render_html("""
        <style>
        .question-card {
            background-color: #f8f9fa;
            padding: 20px;
            border-radius: 10px;
            border-left: 5px solid #1f77b4;
            margin: 20px 0;
        }
        .question {
            color: #1f77b4;
            font-size: 1.2em;
            font-weight: bold;
            margin-bottom: 15px;
        }
        .answer {
            color: #2c3e50;
            line-height: 1.6;
        }
        </style>
    """)


def _render_timeline() -> None:
    """Render Q1: how long the solution took to develop."""
    _render_html("""
        <div class="question-card">
            <div class="question">⏱️ Q1: How long did it take to solve the problem?</div>
            <div class="answer">
                The solution was developed in approximately <b>5 hours</b>
                (excluding data collection and model training phases).
            </div>
        </div>
    """)


def _render_solution_overview() -> None:
    """Render Q2: the multi-stage solution approach."""
    # NOTE: keep the fragment free of blank lines; a blank line would end the
    # markdown HTML block and the indented remainder could be shown as code.
    _render_html("""
        <div class="question-card">
            <div class="question">🔍 Q2: Can you explain your solution approach?</div>
            <div class="answer">
                The solution implements a multi-stage document classification pipeline:
                <br><br>
                <b>1. Data Collection & Processing:</b>
                <ul>
                    <li>Dataset: 2500+ training URLs and 250+ test URLs</li>
                    <li>Implemented ThreadPooling with 20 workers for parallel processing</li>
                    <li>Reduced download time to ~40 minutes (vs. 3+ hours sequential)</li>
                    <li>Used PDFPlumber for robust text extraction</li>
                </ul>
                <br>
                <b>2. Model Development Pipeline:</b>
                <ul>
                    <li><i>Baseline Approach:</i>
                        <ul>
                            <li>TF-IDF vectorization for text representation</li>
                            <li>Logistic Regression for initial classification</li>
                            <li>Quick inference and resource-efficient</li>
                        </ul>
                    </li>
                    <br>
                    <li><i>Advanced Approach:</i>
                        <ul>
                            <li>BERT-based architecture for deep learning</li>
                            <li>Fine-tuned on construction document dataset</li>
                            <li>Superior context understanding and accuracy</li>
                        </ul>
                    </li>
                </ul>
                <br>
                <b>3. Evaluation Strategy:</b>
                <ul>
                    <li>Comprehensive metric suite (Precision, Recall, F1)</li>
                    <li>Special consideration for class imbalance</li>
                    <li>Comparative analysis between baseline and BERT</li>
                </ul>
                <br>
                <b>4. Deployment & Demo:</b>
                <ul>
                    <li>Streamlit-based interactive web interface</li>
                    <li>Real-time document classification</li>
                    <li>Comprehensive project documentation</li>
                    <li>Performance visualization and analytics</li>
                </ul>
                <br>
                <div style='
                    background-color: #e8f4f8;
                    padding: 15px;
                    border-radius: 5px;
                    border-left: 4px solid #1f77b4;
                '>
                    <b>💡 Key implementation:</b> The parallel processing implementation
                    significantly reduced data preparation time, allowing for faster
                    iteration and model experimentation. This, combined with the
                    dual-model approach, provides both efficiency and accuracy in
                    document classification.
                </div>
            </div>
        </div>
    """)


def _render_model_selection() -> None:
    """Render Q3: which models were used and why."""
    _render_html("""
        <div class="question-card">
            <div class="question">🤖 Q3: Which models did you use and why?</div>
            <div class="answer">
                Implemented baseline using TF-IDF and Logistic Regression and then used BERT-based model:
                <br><br>
                <b>Baseline Model:</b>
                <ul>
                    <li>TF-IDF + Logistic Regression</li>
                    <li>Quick inference time</li>
                    <li>Resource-efficient</li>
                </ul>
                <br>
                <b>BERT Model:</b>
                <ul>
                    <li>Fine-tuned on 1800 samples text</li>
                    <li>Better context understanding</li>
                    <li>Better handling of complex documents</li>
                </ul>
            </div>
        </div>
    """)


def _render_limitations() -> None:
    """Render Q4: current limitations and proposed improvements."""
    _render_html("""
        <div class="question-card">
            <div class="question">⚠️ Q4: What are the current limitations and potential improvements?</div>
            <div class="answer">
                <b>Current Implementation & Limitations:</b>
                <ul>
                    <li>~25% of dataset URLs were inaccessible</li>
                    <li>Used Threadpooling for parallel downloading of train and test documents</li>
                </ul>
                <br>
                <b>Proposed Improvements:</b>
                <ul>
                    <li>Use latest LLMs like GPT-4o, Claude 3.5 Sonnet etc with few shot prompting to speed up the development process</li>
                    <li>Optimize inference pipeline for faster processing using distilled models like DistilBERT, or the last BERT based model - ModernBERT to compare the performance</li>
                    <li>Add support for more document formats</li>
                </ul>
            </div>
        </div>
    """)


def _render_test_performance() -> None:
    """Render Q5: the BERT per-class metrics table and its highlight box.

    The table is a static classification report (per-class rows plus
    accuracy, macro and weighted averages); nothing is computed at runtime.
    """
    _render_html("""
        <div class="question-card">
            <div class="question">📊 Q5: What is the model's performance on test data?</div>
            <div class="answer">
                <b>BERT Model Performance:</b>
                <br><br>
                <div style="overflow-x: auto;">
                    <table style="
                        width: 100%;
                        border-collapse: collapse;
                        margin: 20px 0;
                        font-size: 0.9em;
                        font-family: sans-serif;
                        box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
                        border-radius: 5px;
                    ">
                        <thead>
                            <tr style="
                                background-color: #1f77b4;
                                color: white;
                                text-align: left;
                            ">
                                <th style="padding: 12px 15px;">Category</th>
                                <th style="padding: 12px 15px;">Precision</th>
                                <th style="padding: 12px 15px;">Recall</th>
                                <th style="padding: 12px 15px;">F1-Score</th>
                                <th style="padding: 12px 15px;">Support</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr style="border-bottom: 1px solid #dddddd;">
                                <td style="padding: 12px 15px;"><b>Cable</b></td>
                                <td style="padding: 12px 15px;">1.00</td>
                                <td style="padding: 12px 15px;">1.00</td>
                                <td style="padding: 12px 15px;">1.00</td>
                                <td style="padding: 12px 15px;">92</td>
                            </tr>
                            <tr style="border-bottom: 1px solid #dddddd; background-color: #f3f3f3;">
                                <td style="padding: 12px 15px;"><b>Fuses</b></td>
                                <td style="padding: 12px 15px;">0.95</td>
                                <td style="padding: 12px 15px;">1.00</td>
                                <td style="padding: 12px 15px;">0.98</td>
                                <td style="padding: 12px 15px;">42</td>
                            </tr>
                            <tr style="border-bottom: 1px solid #dddddd;">
                                <td style="padding: 12px 15px;"><b>Lighting</b></td>
                                <td style="padding: 12px 15px;">0.94</td>
                                <td style="padding: 12px 15px;">1.00</td>
                                <td style="padding: 12px 15px;">0.97</td>
                                <td style="padding: 12px 15px;">74</td>
                            </tr>
                            <tr style="border-bottom: 1px solid #dddddd; background-color: #f3f3f3;">
                                <td style="padding: 12px 15px;"><b>Others</b></td>
                                <td style="padding: 12px 15px;">1.00</td>
                                <td style="padding: 12px 15px;">0.92</td>
                                <td style="padding: 12px 15px;">0.96</td>
                                <td style="padding: 12px 15px;">83</td>
                            </tr>
                        </tbody>
                        <tfoot>
                            <tr style="background-color: #f8f9fa; font-weight: bold; border-top: 2px solid #dddddd;">
                                <td style="padding: 12px 15px;">Accuracy</td>
                                <td style="padding: 12px 15px;" colspan="3">0.98</td>
                                <td style="padding: 12px 15px;">291</td>
                            </tr>
                            <tr style="background-color: #f8f9fa; color: #666;">
                                <td style="padding: 12px 15px;">Macro Avg</td>
                                <td style="padding: 12px 15px;">0.97</td>
                                <td style="padding: 12px 15px;">0.98</td>
                                <td style="padding: 12px 15px;">0.98</td>
                                <td style="padding: 12px 15px;">291</td>
                            </tr>
                            <tr style="background-color: #f8f9fa; color: #666;">
                                <td style="padding: 12px 15px;">Weighted Avg</td>
                                <td style="padding: 12px 15px;">0.98</td>
                                <td style="padding: 12px 15px;">0.98</td>
                                <td style="padding: 12px 15px;">0.98</td>
                                <td style="padding: 12px 15px;">291</td>
                            </tr>
                        </tfoot>
                    </table>
                </div>
            </div>
        </div>
    """)

    # Highlight box summarising the table above; rendered as its own element.
    _render_html("""
        <div style='
            background-color: #f8f9fa;
            padding: 20px;
            border-radius: 10px;
            border-left: 5px solid #1f77b4;
            margin: 20px 0;
        '>
            ✨ Perfect performance (1.00) for Cable category<br>
            📈 High recall (1.00) across most categories<br>
            🎯 Overall accuracy of 98%<br>
            ⚖️ Balanced performance across all metrics
        </div>
    """)


def _render_metric_rationale() -> None:
    """Render Q6: why precision, recall, F1 and weighted average were chosen."""
    _render_html("""
        <div class="question-card">
            <div class="question">📈 Q6: Why did you choose these particular metrics?</div>
            <div class="answer">
                Our metric selection was driven by the dataset characteristics:
                <br><br>
                <b>Key Considerations:</b>
                <ul>
                    <li>Dataset has mild class imbalance (Imbalance Ratio: 2.36)</li>
                    <li>Need for balanced evaluation across all classes</li>
                </ul>
                <br>
                <b>Selected Metrics:</b>
                <ul>
                    <li><b>Precision:</b> Critical for minimizing false positives</li>
                    <li><b>Recall:</b> Important for catching all instances of each class</li>
                    <li><b>F1-Score:</b> Provides balanced evaluation of both metrics</li>
                    <li><b>Weighted Average:</b> Accounts for class imbalance</li>
                </ul>
            </div>
        </div>
    """)


def _render_performance_chart() -> None:
    """Render the grouped bar chart comparing the baseline and BERT models."""
    st.markdown("### 📊 Model Performance Comparison")

    # Hard-coded scores. The BERT values repeat the accuracy and macro-average
    # figures shown in the Q5 table; the baseline values have no other source
    # in this file.
    metrics = {
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
        'Baseline': [0.85, 0.83, 0.84, 0.83],
        'BERT': [0.98, 0.97, 0.98, 0.98],
    }
    df = pd.DataFrame(metrics)

    # Wide-form input: one bar series per model column, grouped by metric.
    fig = px.bar(
        df,
        x='Metric',
        y=['Baseline', 'BERT'],
        barmode='group',
        title='Model Performance Comparison',
        color_discrete_sequence=['#2ecc71', '#3498db'],
        template='plotly_white',
    )
    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        legend_title_text='Model Type',
        xaxis_title="Evaluation Metric",
        yaxis_title="Score",
        bargap=0.2,
        height=500,
    )
    st.plotly_chart(fig, use_container_width=True)


def main() -> None:
    """Render the full documentation page, top to bottom.

    Sections are emitted in display order: title, shared CSS, the six
    question/answer cards, then the model comparison chart.
    """
    st.title("📚 Project Documentation")

    _inject_styles()
    _render_timeline()
    _render_solution_overview()
    _render_model_selection()
    _render_limitations()
    _render_test_performance()
    _render_metric_rationale()
    _render_performance_chart()


# Streamlit executes the script with __name__ set to "__main__", so the page
# still renders under `streamlit run`; the guard only stops the page from being
# rendered as a side effect of importing this module.
if __name__ == "__main__":
    main()