# survey-analyzer / app.py
# Author: Emily Witko
# Commit 45c8733: Restored topic modeling, keyword extraction, and summarization
from collections import Counter

import gradio as gr
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline

def analyze_demographics(file):
    """Compute NPS-style metrics overall and per demographic group."""
    df = pd.read_excel(file.name)

    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {},
    }

    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]

    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
        # NPS: share of promoters (9-10) minus share of detractors (0-6), as a percentage.
        promoters = (df[recommend_col] >= 9).sum()
        detractors = (df[recommend_col] <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]["HF NPS"] = round(recommend_nps, 2) if recommend_nps is not None else None
        results["Overall Metrics"]["HF NPS (Average)"] = round(recommend_avg, 2)
    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = (df[support_col] >= 9).sum()
        detractors = (df[support_col] <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]["Support NPS"] = round(support_nps, 2) if support_nps is not None else None
        results["Overall Metrics"]["Support NPS (Average)"] = round(support_avg, 2)
    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics"),
    ]
    for demo_col, demo_category in demographic_columns:
        if demo_col not in df.columns:
            continue
        for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
            if col not in df.columns:
                continue
            grouped_demo = df.groupby(demo_col)[col]

            # Per-group NPS.
            nps_by_demo = {}
            for group, scores in grouped_demo:
                promoters = (scores >= 9).sum()
                detractors = (scores <= 6).sum()
                total = scores.notna().sum()
                nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None

            # Tenure groups have a natural order; other demographics are reported as grouped.
            if demo_category == "Tenure Metrics":
                nps_by_demo = {k: nps_by_demo[k] for k in tenure_order if k in nps_by_demo}
            results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}

            # Per-group average score.
            averages_demo = grouped_demo.mean()
            if demo_category == "Tenure Metrics":
                averages_demo = averages_demo.reindex([k for k in tenure_order if k in averages_demo.index])
            results[demo_category][f"{prefix} (Average)"] = averages_demo.round(2).to_dict()

    return results
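
# Illustrative sketch, not called by the app: a self-contained check of the NPS
# arithmetic used in analyze_demographics. Promoters score 9-10, detractors 0-6,
# and NPS = (promoters - detractors) / respondents * 100, so the toy scores
# below give (2 - 1) / 4 * 100 = 25.0. The helper name is hypothetical.
def _nps_example():
    scores = pd.Series([10, 9, 7, 3])
    promoters = (scores >= 9).sum()
    detractors = (scores <= 6).sum()
    return (promoters - detractors) / scores.notna().sum() * 100  # -> 25.0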

def analyze_why_columns(file):
    """Run sentiment, topic, keyword, and summary analysis on free-text "Why?" columns."""
    df = pd.read_excel(file.name)

    # Map column names to new labels. pandas de-duplicates repeated
    # "Why? (optional)" headers by appending ".1", ".2", ... to the full name.
    column_label_map = {
        "Why? (optional)": "HF NPS Why?",
        "Why? (optional).1": "Support Team NPS Why?",
        "Why? (optional).2": "Productivity Why?",
    }
    df = df.rename(columns=column_label_map)

    # Keep only the renamed "Why" columns that are actually present.
    why_columns = [col for col in df.columns if col in column_label_map.values()]

    results = {}
    # Note: SST-2 is a binary model, so it only emits POSITIVE or NEGATIVE;
    # the NEUTRAL buckets below stay empty unless the model is swapped out.
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    for col in why_columns:
        column_data = df[col].dropna().astype(str).tolist()
        if not column_data:
            continue  # nothing to analyze for this column

        # Sentiment analysis with confidence scores; truncate responses that
        # exceed the model's maximum input length.
        sentiments = sentiment_analyzer(column_data, truncation=True)
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}
        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})
        # Topic modeling: a 3-topic LDA over the responses, reporting the
        # top five words per topic.
        vectorizer = CountVectorizer(stop_words="english")
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # Keyword extraction: the ten most frequent 2- and 3-word phrases.
        combined_text = " ".join(column_data)
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words="english")
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]
        # Summarization: BART has a limited input window, so summarize the
        # combined text in word-based chunks and stitch the pieces together.
        def split_text(text, max_words=500):
            words = text.split()
            for i in range(0, len(words), max_words):
                yield " ".join(words[i:i + max_words])

        summaries = []
        for chunk in split_text(combined_text):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
            summaries.append(summary)
        final_summary = " ".join(summaries)

        # Store results for this column.
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary,
        }

    return results
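
# Illustrative sketch of the per-column structure analyze_why_columns returns;
# every value below is an invented placeholder, not real survey output.
_EXAMPLE_WHY_RESULT = {
    "HF NPS Why?": {
        "Sentiment Analysis Summary": {"POSITIVE": 12, "NEGATIVE": 3, "NEUTRAL": 0},
        "Detailed Sentiments": {
            "POSITIVE": [{"response": "Great team and culture", "score": 0.98}],
            "NEGATIVE": [{"response": "Too many meetings", "score": 0.91}],
            "NEUTRAL": [],
        },
        "Topics": ["Topic 1: team, culture, remote, growth, tools"],
        "Keywords": ["great team (4 mentions)"],
        "Summary": "Respondents praise the team and culture.",
    }
}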

def process_file(file):
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)
    return quantitative_results, qualitative_results

def app():
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")
    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights.",
    )
    return iface
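
# To run locally (assumes gradio, pandas, scikit-learn, and transformers are
# installed and the model weights can be downloaded on first use):
#   python app.py
# launch(share=True) prints a temporary public URL alongside the local one.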
if __name__ == "__main__":
    app().launch(share=True)