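# Survey Data Analyzer: a Gradio app that computes NPS-style quantitative
# metrics and qualitative (sentiment, topic, keyword, summary) analyses
# from an uploaded Excel survey export.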
import gradio as gr
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter


def analyze_demographics(file):
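    """Compute overall and per-demographic-group NPS metrics from the survey export."""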
    df = pd.read_excel(file.name)
    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {}
    }
    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]
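    # NPS = ((promoters - detractors) / respondents) * 100, where promoters score
    # 9-10 and detractors 0-6 on the 0-10 scale.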
    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
        promoters = (df[recommend_col] >= 9).sum()
        detractors = (df[recommend_col] <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]["HF NPS"] = round(recommend_nps, 2) if recommend_nps is not None else None
        results["Overall Metrics"]["HF NPS (Average)"] = round(recommend_avg, 2)
    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = (df[support_col] >= 9).sum()
        detractors = (df[support_col] <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]["Support NPS"] = round(support_nps, 2) if support_nps is not None else None
        results["Overall Metrics"]["Support NPS (Average)"] = round(support_avg, 2)
    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics")
    ]
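    # Break both NPS questions down by each demographic column that is present.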
    for demo_col, demo_category in demographic_columns:
        if demo_col in df.columns:
            for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
                if col in df.columns:
                    grouped_demo = df.groupby(demo_col)[col]
                    nps_by_demo = {}
                    for group, scores in grouped_demo:
                        promoters = (scores >= 9).sum()
                        detractors = (scores <= 6).sum()
                        total = scores.notna().sum()
                        nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None
                    # Report tenure buckets in chronological order rather than
                    # the alphabetical order groupby produces.
                    if demo_category == "Tenure Metrics":
                        nps_by_demo = {k: nps_by_demo[k] for k in tenure_order if k in nps_by_demo}
                    results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}
                    averages_demo = grouped_demo.mean()
                    if demo_category == "Tenure Metrics":
                        sorted_averages = {k: averages_demo[k] for k in tenure_order if k in averages_demo}
                        results[demo_category][f"{prefix} (Average)"] = {k: round(v, 2) for k, v in sorted_averages.items()}
                    else:
                        results[demo_category][f"{prefix} (Average)"] = averages_demo.round(2).to_dict()
    return results


def analyze_why_columns(file):
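    """Run sentiment, topic, keyword, and summarization analyses on the free-text "Why?" columns."""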
    df = pd.read_excel(file.name)
    # Map the raw "Why? (optional)" column headers to descriptive labels.
    column_label_map = {
        "Why? (optional)": "HF NPS Why?",
        "Why? (optional.1)": "Support Team NPS Why?",
        "Why? (optional.2)": "Productivity Why?"
    }
    df = df.rename(columns=column_label_map)
    # Keep only the renamed "Why?" columns that actually exist in this export.
    why_columns = [col for col in df.columns if col in column_label_map.values()]
    results = {}
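    # Note: building the pipelines here reloads the models on every call;
    # hoisting them to module level would avoid that overhead.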
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    for col in why_columns:
        column_data = df[col].dropna().astype(str).tolist()
        if not column_data:
            continue
        # Sentiment analysis with confidence scores. The SST-2 model only emits
        # POSITIVE/NEGATIVE; NEUTRAL is kept as a placeholder bucket.
        sentiments = sentiment_analyzer(column_data, truncation=True)
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}
        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})
        # Topic modeling: fit a small LDA model over this column's responses.
        vectorizer = CountVectorizer(stop_words="english")
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for idx, topic in enumerate(lda.components_):
            # argsort is ascending, so take the last five indices and reverse
            # them to list the most heavily weighted words first.
            top_words = [feature_names[i] for i in topic.argsort()[-5:][::-1]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))
        # Keyword extraction: most frequent bigrams and trigrams across all responses.
        combined_text = " ".join(column_data)
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words="english")
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]
        # Summarization: BART's input length is capped (~1024 tokens), so split
        # the combined text into word chunks and join the partial summaries.
        def split_text(text, max_length=500):
            words = text.split()
            for i in range(0, len(words), max_length):
                yield " ".join(words[i:i + max_length])
        summaries = []
        for chunk in split_text(combined_text, max_length=500):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
            summaries.append(summary)
        final_summary = " ".join(summaries)
        # Store results for this column.
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary
        }
    return results


def process_file(file):
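    """Run both analyses on the uploaded file and return (quantitative, qualitative) results."""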
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)
    return quantitative_results, qualitative_results


def app():
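    """Build the Gradio interface: one Excel upload in, two JSON panels out."""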
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")
    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights."
    )
    return iface


if __name__ == "__main__":
    app().launch(share=True)