# survey-analyzer / app.py
# Author: Emily Witko
# Commit 45c8733: Restored topic modeling, keyword extraction, and summarization
from collections import Counter

import gradio as gr
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline

def analyze_demographics(file):
    """Compute NPS-style metrics overall and per demographic group."""
    df = pd.read_excel(file.name)

    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {},
    }

    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]

    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
        # NPS: share of promoters (9-10) minus share of detractors (0-6), as a percentage.
        promoters = (df[recommend_col] >= 9).sum()
        detractors = (df[recommend_col] <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]["HF NPS"] = round(recommend_nps, 2) if recommend_nps is not None else None
        results["Overall Metrics"]["HF NPS (Average)"] = round(recommend_avg, 2)
    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = (df[support_col] >= 9).sum()
        detractors = (df[support_col] <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]["Support NPS"] = round(support_nps, 2) if support_nps is not None else None
        results["Overall Metrics"]["Support NPS (Average)"] = round(support_avg, 2)
    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics"),
    ]
    for demo_col, demo_category in demographic_columns:
        if demo_col not in df.columns:
            continue
        for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
            if col not in df.columns:
                continue
            grouped_demo = df.groupby(demo_col)[col]

            # Per-group NPS.
            nps_by_demo = {}
            for group, scores in grouped_demo:
                promoters = (scores >= 9).sum()
                detractors = (scores <= 6).sum()
                total = scores.notna().sum()
                nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None

            # Tenure groups have a natural order; other demographics are reported as grouped.
            if demo_category == "Tenure Metrics":
                nps_by_demo = {k: nps_by_demo[k] for k in tenure_order if k in nps_by_demo}
            results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}

            # Per-group average score.
            averages_demo = grouped_demo.mean()
            if demo_category == "Tenure Metrics":
                averages_demo = averages_demo.reindex([k for k in tenure_order if k in averages_demo.index])
            results[demo_category][f"{prefix} (Average)"] = averages_demo.round(2).to_dict()

    return results
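
# Illustrative sketch, not called by the app: a self-contained check of the NPS
# arithmetic used in analyze_demographics. Promoters score 9-10, detractors 0-6,
# and NPS = (promoters - detractors) / respondents * 100, so the toy scores
# below give (2 - 1) / 4 * 100 = 25.0. The helper name is hypothetical.
def _nps_example():
    scores = pd.Series([10, 9, 7, 3])
    promoters = (scores >= 9).sum()
    detractors = (scores <= 6).sum()
    return (promoters - detractors) / scores.notna().sum() * 100  # -> 25.0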

def analyze_why_columns(file):
    """Run sentiment, topic, keyword, and summary analysis on free-text "Why?" columns."""
    df = pd.read_excel(file.name)

    # Map column names to new labels. pandas de-duplicates repeated
    # "Why? (optional)" headers by appending ".1", ".2", ... to the full name.
    column_label_map = {
        "Why? (optional)": "HF NPS Why?",
        "Why? (optional).1": "Support Team NPS Why?",
        "Why? (optional).2": "Productivity Why?",
    }
    df = df.rename(columns=column_label_map)

    # Keep only the renamed "Why" columns that are actually present.
    why_columns = [col for col in df.columns if col in column_label_map.values()]

    results = {}
    # Note: SST-2 is a binary model, so it only emits POSITIVE or NEGATIVE;
    # the NEUTRAL buckets below stay empty unless the model is swapped out.
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    for col in why_columns:
        column_data = df[col].dropna().astype(str).tolist()
        if not column_data:
            continue  # nothing to analyze for this column

        # Sentiment analysis with confidence scores; truncate responses that
        # exceed the model's maximum input length.
        sentiments = sentiment_analyzer(column_data, truncation=True)
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}
        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})
        # Topic modeling: a 3-topic LDA over the responses, reporting the
        # top five words per topic.
        vectorizer = CountVectorizer(stop_words="english")
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [feature_names[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # Keyword extraction: the ten most frequent 2- and 3-word phrases.
        combined_text = " ".join(column_data)
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words="english")
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]
        # Summarization: BART has a limited input window, so summarize the
        # combined text in word-based chunks and stitch the pieces together.
        def split_text(text, max_words=500):
            words = text.split()
            for i in range(0, len(words), max_words):
                yield " ".join(words[i:i + max_words])

        summaries = []
        for chunk in split_text(combined_text):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
            summaries.append(summary)
        final_summary = " ".join(summaries)

        # Store results for this column.
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary,
        }

    return results
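
# Illustrative sketch of the per-column structure analyze_why_columns returns;
# every value below is an invented placeholder, not real survey output.
_EXAMPLE_WHY_RESULT = {
    "HF NPS Why?": {
        "Sentiment Analysis Summary": {"POSITIVE": 12, "NEGATIVE": 3, "NEUTRAL": 0},
        "Detailed Sentiments": {
            "POSITIVE": [{"response": "Great team and culture", "score": 0.98}],
            "NEGATIVE": [{"response": "Too many meetings", "score": 0.91}],
            "NEUTRAL": [],
        },
        "Topics": ["Topic 1: team, culture, remote, growth, tools"],
        "Keywords": ["great team (4 mentions)"],
        "Summary": "Respondents praise the team and culture.",
    }
}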

def process_file(file):
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)
    return quantitative_results, qualitative_results

def app():
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")
    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights.",
    )
    return iface
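
# To run locally (assumes gradio, pandas, scikit-learn, and transformers are
# installed and the model weights can be downloaded on first use):
#   python app.py
# launch(share=True) prints a temporary public URL alongside the local one.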
if __name__ == "__main__":
    app().launch(share=True)