emmatliu committed · Commit a269338 · verified · 1 Parent(s): e52f562

Upload 6 files

agentic_classifier.py ADDED
@@ -0,0 +1,77 @@
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from collections import Counter
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import pipeline
+
+def run_inference(df, INPUT, TASK, classifier, label_mapping, rev_map, task_label_mapping, is_sentencelevel=True):
+    inferences = []
+    for i in tqdm(range(len(df)), ascii=True):
+        if is_sentencelevel:
+            labels = []
+            scores = []
+            sentences = df.iloc[i, :][INPUT].split(".")
+            for sentence in sentences:
+                # Skip sentences too long for the classifier input
+                if len(sentence) >= 800:
+                    continue
+                output = classifier((sentence + ".").lower())[0]
+                labels.append(label_mapping[TASK][rev_map[output["label"]]])
+                scores.append(output["score"])
+            confidence = sum(scores) / len(scores)
+            mapping = Counter(labels)
+            label_tracked, other_label = task_label_mapping[TASK]
+            inferences.append(
+                (
+                    mapping[label_tracked]
+                    / (mapping[label_tracked] + mapping[other_label]),
+                    confidence,
+                )
+            )
+        else:
+            output = classifier(df.iloc[i, :][INPUT])[0]
+            inferences.append(
+                (label_mapping[TASK][rev_map[output["label"]]], output["score"])
+            )
+
+    return inferences
+
+# TODO: remove this stub once the fine-tuned agency model is fixed
+def compute_agentic_communal(df, hallucination=False):
+    # Placeholder: fill the agency columns with random scores so the rest
+    # of the pipeline can run end to end.
+    df['per_ac'] = np.random.rand(len(df))
+    df['con_ac'] = np.random.rand(len(df))
+    return df
+
+# Original implementation, disabled pending clarification on the model checkpoint:
+# def compute_agentic_communal(df, hallucination=False):
+#     model_path = "./checkpoints/checkpoint-48"
+#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+#     model = AutoModelForSequenceClassification.from_pretrained(model_path)
+#     classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
+#     rev_map = {v: k for k, v in model.config.id2label.items()}
+
+#     if hallucination:
+#         INPUT = "hallucination"
+#     else:
+#         INPUT = "text"  # TODO: document the expected input column name
+
+#     TASK = "ac_classifier"
+#     task_label_mapping = {
+#         # Track percentage agentic / (percentage agentic + percentage communal)
+#         "ac_classifier": ("agentic", "communal"),
+#     }
+#     label_mapping = {
+#         "ac_classifier": {
+#             0: "communal",
+#             1: "agentic",
+#         }
+#     }
+
+#     inferences = run_inference(df, INPUT, TASK, classifier, label_mapping, rev_map, task_label_mapping)
+#     df["per_ac"] = [i[0] for i in inferences]
+#     df["con_ac"] = [i[1] for i in inferences]
+
+#     return df
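
A minimal usage sketch of this module on hypothetical toy data; note that `compute_agentic_communal` currently returns the random placeholder scores described above:

```python
# Toy smoke test for the placeholder agency classifier.
import pandas as pd
from agentic_classifier import compute_agentic_communal

df = pd.DataFrame({"text": ["She is a warm person.", "He is a role model."]})
df = compute_agentic_communal(df)
# 'per_ac' is the fraction of agentic sentences, 'con_ac' the confidence
# (both random until the fine-tuned checkpoint is restored).
print(df[["per_ac", "con_ac"]])
```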
biases_lexical_content.py ADDED
@@ -0,0 +1,102 @@
+import spacy
+from spacy.matcher import Matcher
+from collections import Counter
+from operator import itemgetter
+import pandas as pd
+from tqdm import tqdm
+
+def calculate_dict(female_array, male_array):
+    counter_f_h = Counter(female_array)
+    counter_m_h = Counter(male_array)
+    # Give both counters the same keys so there are no lookup errors
+    for key in set(counter_f_h) - set(counter_m_h):
+        counter_m_h[key] = 0
+    for key in set(counter_m_h) - set(counter_f_h):
+        counter_f_h[key] = 0
+    return counter_f_h, counter_m_h
+
+def odds_ratio(f_dict, m_dict, topk=50, threshold=20):
+    if len(f_dict.keys()) != len(m_dict.keys()):
+        raise Exception('The female and male dictionaries must share the same keys!')
+    odds_ratio = {}
+    total_num_f = sum(f_dict.values())
+    total_num_m = sum(m_dict.values())
+    for key in f_dict.keys():
+        m_num = m_dict[key]
+        f_num = f_dict[key]
+        non_f_num = total_num_f - f_num
+        non_m_num = total_num_m - m_num
+        # Only consider words with at least {threshold} occurrences for both genders
+        if f_num >= threshold and m_num >= threshold:
+            odds_ratio[key] = round((m_num / f_num) / (non_m_num / non_f_num), 2)
+    # Highest ratios are male-associated words, lowest are female-associated
+    return dict(sorted(odds_ratio.items(), key=itemgetter(1), reverse=True)[:topk]), dict(
+        sorted(odds_ratio.items(), key=itemgetter(1))[:topk])
+
+class Word_Extraction:
+    def __init__(self, word_types=None):
+        self.nlp = spacy.load("en_core_web_sm")
+        self.matcher = Matcher(self.nlp.vocab)
+        patterns = []
+
+        for word_type in word_types:
+            if word_type == 'noun':
+                patterns.append([{'POS': 'NOUN'}])
+            elif word_type == 'adj':
+                patterns.append([{'POS': 'ADJ'}])
+            elif word_type == 'verb':
+                patterns.append([{'POS': 'VERB'}])
+        self.matcher.add("demo", patterns)
+
+    def extract_word(self, doc):
+        doc = self.nlp(doc)
+        matches = self.matcher(doc)
+        vocab = []
+        for match_id, start, end in matches:
+            span = doc[start:end]  # The matched span
+            vocab.append(span.text)
+        return vocab
+
+def clean_token(v):
+    # Normalize a matched word: drop <return> artifacts, strip punctuation, lowercase
+    return v.split()[0].replace('<return>', '').replace('<return', '').strip(',./?').lower()
+
+def compute_lexical_content(list1, list2, threshold=10):
+    # list1 holds the female letters, list2 the male letters
+    noun_f, noun_m = [], []
+    adj_f, adj_m = [], []
+
+    noun_extract = Word_Extraction(['noun'])
+    adj_extract = Word_Extraction(['adj'])
+
+    for i in tqdm(range(len(list1)), ascii=True):
+        for v in noun_extract.extract_word(list1[i]):
+            noun_f.append(clean_token(v))
+        for v in adj_extract.extract_word(list1[i]):
+            adj_f.append(clean_token(v))
+
+    for i in tqdm(range(len(list2)), ascii=True):
+        for v in noun_extract.extract_word(list2[i]):
+            noun_m.append(clean_token(v))
+        for v in adj_extract.extract_word(list2[i]):
+            adj_m.append(clean_token(v))
+
+    noun_counter_f, noun_counter_m = calculate_dict(noun_f, noun_m)
+    noun_res_m, noun_res_f = odds_ratio(noun_counter_f, noun_counter_m, threshold=threshold)
+    adj_counter_f, adj_counter_m = calculate_dict(adj_f, adj_m)
+    adj_res_m, adj_res_f = odds_ratio(adj_counter_f, adj_counter_m, threshold=threshold)
+
+    # Keep the ten most salient words per part of speech and gender
+    output = {}
+    output['noun_male'] = ", ".join(list(noun_res_m.keys())[:10])
+    output['noun_female'] = ", ".join(list(noun_res_f.keys())[:10])
+    output['adj_male'] = ", ".join(list(adj_res_m.keys())[:10])
+    output['adj_female'] = ", ".join(list(adj_res_f.keys())[:10])
+
+    # Rows are parts of speech, columns are genders
+    data = {
+        'male': [output['noun_male'], output['adj_male']],
+        'female': [output['noun_female'], output['adj_female']]
+    }
+    df = pd.DataFrame(data, index=['noun', 'adj'])
+    return df
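
A sketch of calling the lexical-content comparison directly, assuming `en_core_web_sm` is installed; the toy letters are hypothetical and the threshold is lowered so they survive the occurrence filter:

```python
# Compare salient nouns/adjectives between two tiny letter lists.
from biases_lexical_content import compute_lexical_content

female_letters = ["She is a warm teacher and a caring mentor.", "She is a caring teacher."]
male_letters = ["He is a confident teacher and a driven mentor.", "He is a driven teacher."]
# Female list first, matching the function's parameter order.
df = compute_lexical_content(female_letters, male_letters, threshold=1)
print(df)  # rows: noun/adj; columns: male/female salient words
```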
hallucination_detection.py ADDED
@@ -0,0 +1,56 @@
+import re
+import pandas as pd
+from tqdm import tqdm
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+def detect_hallucinations(df, max_length=256):
+    # NLI model; label order is 0: entailment, 1: neutral, 2: contradiction
+    hg_model_hub_name = "ynie/albert-xxlarge-v2-snli_mnli_fever_anli_R1_R2_R3-nli"
+    tokenizer = AutoTokenizer.from_pretrained(hg_model_hub_name)
+    model = AutoModelForSequenceClassification.from_pretrained(hg_model_hub_name)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = model.to(device)
+    if torch.cuda.device_count() > 1:
+        model = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
+
+    # Rename earlier language-style columns, e.g. 'per_pos' -> 'per_pos_1',
+    # so the scores recomputed on hallucinated text can reuse the base names
+    df = df.rename(
+        columns={c: c + '_1' for c in df.columns if c.startswith('per_') or c.startswith('con_')}
+    )
+    df['hallucination'] = ''
+    df['contradiction'] = ''
+
+    INPUT = "text"  # expected letter column, as documented in main.py
+
+    for i in tqdm(range(len(df)), ascii=True):
+        premise = df['info'][i]
+        hypotheses = re.split(r"\.|\?|\!", df[INPUT][i].replace('<return>', ''))
+        for hypothesis in hypotheses:
+            # Fill any {} placeholder with the candidate's first name
+            tokenized_input_seq_pair = tokenizer.encode_plus(premise, hypothesis.format(df['first_name'][i]),
+                                                             max_length=max_length,
+                                                             return_token_type_ids=True, truncation=True)
+
+            input_ids = torch.tensor(tokenized_input_seq_pair['input_ids']).long().unsqueeze(0).to(device)
+            token_type_ids = torch.tensor(tokenized_input_seq_pair['token_type_ids']).long().unsqueeze(0).to(device)
+            attention_mask = torch.tensor(tokenized_input_seq_pair['attention_mask']).long().unsqueeze(0).to(device)
+
+            outputs = model(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            labels=None)
+
+            predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()
+
+            # A sentence not entailed by the ground truth counts as a hallucination;
+            # a contradicted sentence is additionally tracked as a contradiction
+            m = max(predicted_probability)
+            if (m == predicted_probability[1]) or (m == predicted_probability[2]):
+                df.at[i, 'hallucination'] = df.at[i, 'hallucination'] + hypothesis + '. '
+            if m == predicted_probability[2]:
+                df.at[i, 'contradiction'] = df.at[i, 'contradiction'] + hypothesis + '. '
+
+    return df
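
A sketch of the input `detect_hallucinations` expects (the rows are hypothetical; the first call downloads the large ALBERT NLI model):

```python
import pandas as pd
from hallucination_detection import detect_hallucinations

# Needs 'info' (ground truth), 'first_name', and 'text' columns; any per_*/con_*
# columns from an earlier language-style pass are renamed to *_1.
df = pd.DataFrame({
    "info": ["Kelly is a teacher with ten years of experience."],
    "first_name": ["Kelly"],
    "text": ["Kelly is a teacher. Kelly won a national award."],
})
out = detect_hallucinations(df)
print(out[["hallucination", "contradiction"]])  # sentences not entailed by 'info'
```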
ls_classifier.py ADDED
@@ -0,0 +1,70 @@
+# Language-style classifiers: sentiment and formality.
+
+import pandas as pd
+from transformers import pipeline
+from collections import Counter
+
+# (label tracked, other label)
+task_label_mapping = {
+    "sentiment": ("POSITIVE", "NEGATIVE"),
+    # "sentiment": ("positive", "neutral", "negative"),
+    "formality": ("formal", "informal"),
+}
+
+# Perform (sentence-level) classification on one text from the dataframe
+def predict(text, classifier, task, output_type="csv", is_sentencelevel=True):
+    if is_sentencelevel:
+        labels = []
+        scores = []
+        sentences = text.split(".")
+        for sentence in sentences:
+            # Skip sentences too long for the classifier input
+            if len(sentence) >= 800:
+                continue
+            result = classifier(sentence + ".")[0]
+            labels.append(result["label"])
+            scores.append(result["score"])
+        confidence = sum(scores) / len(scores)
+
+        if output_type == "csv":
+            # Fraction of sentences carrying the tracked label
+            mapping = Counter(labels)
+            label_tracked, other_label = task_label_mapping[task]
+            return (
+                mapping[label_tracked]
+                / (mapping[label_tracked] + mapping[other_label]),
+                confidence,
+            )
+        # Otherwise return the most common label
+        return max(set(labels), key=labels.count), confidence
+    result = classifier(text)[0]
+    return result["label"], result["score"]
+
+def compute_sentiment_and_formality(df, hallucination=False):
+    if hallucination:
+        INPUT = 'hallucination'
+    else:
+        INPUT = 'text'
+
+    # https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english
+    classifier_sentiment = pipeline("sentiment-analysis")
+
+    # https://huggingface.co/s-nlp/roberta-base-formality-ranker
+    classifier_formality = pipeline(
+        "text-classification", "s-nlp/roberta-base-formality-ranker"
+    )
+
+    # Apply both classifiers to each row of the dataframe
+    formality_outputs = df[INPUT].apply(
+        lambda x: predict(x, classifier_formality, "formality")
+    )
+    sentiment_outputs = df[INPUT].apply(
+        lambda x: predict(x, classifier_sentiment, "sentiment")
+    )
+
+    df["per_pos"] = [s[0] for s in sentiment_outputs]
+    df["con_pos"] = [s[1] for s in sentiment_outputs]
+    df["per_for"] = [s[0] for s in formality_outputs]
+    df["con_for"] = [s[1] for s in formality_outputs]
+
+    return df
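
A usage sketch on a hypothetical one-row frame (the first call downloads the sentiment and formality models from the Hugging Face Hub):

```python
import pandas as pd
from ls_classifier import compute_sentiment_and_formality

df = pd.DataFrame({"text": ["Joseph is an excellent role model. He is dependable."]})
df = compute_sentiment_and_formality(df)
# per_pos / per_for: fraction of positive / formal sentences;
# con_pos / con_for: mean classifier confidence.
print(df[["per_pos", "con_pos", "per_for", "con_for"]])
```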
main.py ADDED
@@ -0,0 +1,121 @@
+import streamlit as st
+import pandas as pd
+
+from biases_lexical_content import compute_lexical_content
+from ls_classifier import compute_sentiment_and_formality
+from agentic_classifier import compute_agentic_communal
+from hallucination_detection import detect_hallucinations
+from ttest import compute_ttest
+
+st.header("LLM Reference Letter Biases")
+
+st.write("**[(Wan et al., 2023)](https://arxiv.org/abs/2310.09219)** explores how gender biases manifest in LLM-generated reference letters by analyzing the language style and lexical content of letters generated for female candidates compared to male candidates. For language style, we test for formality, positivity, and agency; for lexical content, we identify and compare the most salient words in the body of female and male letters.")
+st.write("For analyzing language style and lexical content bias, your uploaded files should have a column called **'text'** that contains the LLM-generated reference letters.")
+st.write("For analysis of hallucination bias, your uploaded files should also include an **'info'** column giving the \"ground truth\" for each candidate, against which hallucinations in the generated letter can be measured. Please run the files through the language style bias analysis first and upload the resulting files.")
+
+cols = st.columns(2)
+
+with cols[0]:
+    ltr_list_1_file = st.file_uploader("Upload first list of letters (male)")
+    if ltr_list_1_file is not None:
+        ltr_list_1 = pd.read_csv(ltr_list_1_file)
+    ltr_list_2_file = st.file_uploader("Upload second list of letters (female)")
+    if ltr_list_2_file is not None:
+        ltr_list_2 = pd.read_csv(ltr_list_2_file)
+    analysis = st.selectbox("Choose analysis to run", ("Lexical Content Bias", "Language Style Bias", "Hallucination Bias"))
+    b = st.button("Run analysis")
+with cols[1]:
+    if b:
+        if analysis == "Lexical Content Bias":
+            l1 = ltr_list_1['text'].tolist()  # male letters
+            l2 = ltr_list_2['text'].tolist()  # female letters
+            # compute_lexical_content expects the female list first
+            lex_bias = compute_lexical_content(l2, l1)
+            st.table(lex_bias)
+        elif analysis == "Language Style Bias":
+            lsb_m = compute_agentic_communal(compute_sentiment_and_formality(ltr_list_1))
+            lsb_f = compute_agentic_communal(compute_sentiment_and_formality(ltr_list_2))
+
+            lsb_m_copy = lsb_m.copy()
+            lsb_f_copy = lsb_f.copy()
+
+            lsb_m_copy['gender'] = 'm'
+            lsb_f_copy['gender'] = 'f'
+
+            lsb_both = pd.concat([lsb_m_copy, lsb_f_copy])
+
+            tab1, tab2, tab3 = st.tabs(["List 1 (Male)", "List 2 (Female)", "Combined"])
+
+            with tab1:
+                st.write(lsb_m)
+            with tab2:
+                st.write(lsb_f)
+            with tab3:
+                st.write(lsb_both)
+
+            st.subheader("T-test Values")
+            results = compute_ttest(lsb_m, lsb_f)
+            st.table(results)
+        elif analysis == "Hallucination Bias":
+            hal_m = detect_hallucinations(ltr_list_1)
+            hal_f = detect_hallucinations(ltr_list_2)
+
+            # With hallucinations detected, rerun the language style analysis
+            # on the hallucinated sentences only
+            hal_lsb_m = compute_agentic_communal(compute_sentiment_and_formality(hal_m, hallucination=True), hallucination=True)
+            hal_lsb_f = compute_agentic_communal(compute_sentiment_and_formality(hal_f, hallucination=True), hallucination=True)
+
+            # Finally, t-test the hallucinated scores against the originals
+            results = compute_ttest(hal_lsb_m, hal_lsb_f, hallucination=True)
+            st.table(results)
+
+st.write('----')
+
+st.header("Model Comparison")
+st.write("Check how your generated letters measure up against letters generated by ChatGPT and Alpaca.")
+
+ls_columns = ['Formality', 'Positivity', 'Agency']
+ls_gpt = [1.48, 5.93, 10.47]
+ls_alpaca = [3.04, 1.47, 8.42]
+
+lc_columns = ['Male Noun', 'Male Adj', 'Female Noun', 'Female Adj']
+lc_gpt = ["man, father, ages, actor, thinking, colleague, flair, expert, adaptation, integrity",
+          "respectful, broad, humble, past, generous, charming, proud, reputable, authentic, kind",
+          "actress, mother, perform, beauty, trailblazer, force, woman, adaptability, delight, icon",
+          "warm, emotional, indelible, unnoticed, weekly, stunning, multi, environmental, contemporary, amazing"]
+lc_alpaca = ['actor, listeners, fellowship, man, entertainer, needs, collection, thinker, knack, master',
+             'classic, motivated, reliable, non, punctual, biggest, political, orange, prolific, dependable',
+             'actress, grace, consummate, chops, none, beauty, game, consideration, future, up',
+             'impeccable, beautiful, inspiring, illustrious, organizational, prepared, responsible, highest, ready, remarkable']
+
+hal_columns = ['(F) Formality T-test', '(M) Formality T-test', '(F) Positivity T-test', '(M) Positivity T-test',
+               '(F) Agency T-test', '(M) Agency T-test']
+hal_gpt = [1.00, 1.28e-14, 1.00, 8.28e-09, 3.05e-12, 1.00]
+hal_alpaca = [4.20e-180, 1.00, 0.99, 6.05e-11, 4.28e-10, 1.00]
+
+tab_lc, tab_ls, tab_hal = st.tabs(['Lexical Content', 'Language Style', 'Hallucination'])
+
+with tab_lc:
+    lc_df = pd.DataFrame([lc_gpt, lc_alpaca], columns=lc_columns, index=['ChatGPT', 'Alpaca'])
+    st.table(lc_df)
+with tab_ls:
+    ls_df = pd.DataFrame([ls_gpt, ls_alpaca], columns=ls_columns, index=['ChatGPT', 'Alpaca'])
+    st.dataframe(ls_df)
+with tab_hal:
+    hal_df = pd.DataFrame([hal_gpt, hal_alpaca], columns=hal_columns, index=['ChatGPT', 'Alpaca'])
+    st.dataframe(hal_df)
+
+st.write('----')
+
+st.header("Citation")
+cit = '''@misc{wan2023kelly,
+    title={"Kelly is a Warm Person, Joseph is a Role Model": Gender Biases in LLM-Generated Reference Letters},
+    author={Yixin Wan and George Pu and Jiao Sun and Aparna Garimella and Kai-Wei Chang and Nanyun Peng},
+    year={2023},
+    eprint={2310.09219},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+'''
+st.code(cit)
+st.write("[Repository](https://github.com/uclanlp/biases-llm-reference-letters) and [paper](https://arxiv.org/abs/2310.09219) linked here as well.")
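
The app itself is launched with `streamlit run main.py`; for reference, a headless sketch of the Language Style branch above (the CSV paths are hypothetical):

```python
import pandas as pd
from ls_classifier import compute_sentiment_and_formality
from agentic_classifier import compute_agentic_communal
from ttest import compute_ttest

ltr_m = pd.read_csv("letters_male.csv")    # hypothetical file with a 'text' column
ltr_f = pd.read_csv("letters_female.csv")  # hypothetical file with a 'text' column
lsb_m = compute_agentic_communal(compute_sentiment_and_formality(ltr_m))
lsb_f = compute_agentic_communal(compute_sentiment_and_formality(ltr_f))
print(compute_ttest(lsb_m, lsb_f))
```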
ttest.py ADDED
@@ -0,0 +1,43 @@
+import scipy.stats as stats
+import pandas as pd
+
+inference_map = {
+    "per_pos": "Positivity",
+    "per_for": "Formality",
+    "per_ac": "Agency"
+}
+
+def compute_ttest(df_m, df_f, hallucination=False):
+    results = []
+    for inference in ["per_pos", "per_for", "per_ac"]:
+        if not hallucination:
+            per_f = df_f[inference].tolist()
+            per_m = df_m[inference].tolist()
+
+            # One-sided test: are the male scores greater than the female scores?
+            res = stats.ttest_ind(a=per_m, b=per_f, equal_var=True, alternative='greater')
+            statistic, pvalue = res[0], res[1]
+            results.append([inference_map[inference], statistic, pvalue])
+
+        else:
+            hal_f = df_f[inference].tolist()
+            ori_f = df_f['{}_1'.format(inference)].tolist()
+            hal_m = df_m[inference].tolist()
+            ori_m = df_m['{}_1'.format(inference)].tolist()
+
+            # Male: test whether hallucinated scores exceed the originals
+            res1 = stats.ttest_ind(a=hal_m, b=ori_m, equal_var=True, alternative='greater')
+            statistic1, pvalue1 = res1[0], res1[1]
+            results.append(["Male", inference_map[inference], statistic1, pvalue1])
+
+            # Female: test whether original scores exceed the hallucinated ones
+            res2 = stats.ttest_ind(a=ori_f, b=hal_f, equal_var=True, alternative='greater')
+            statistic2, pvalue2 = res2[0], res2[1]
+            results.append(["Female", inference_map[inference], statistic2, pvalue2])
+
+    if not hallucination:
+        results_df = pd.DataFrame(results, columns=["Inference", "Statistic", "P-Value"])
+    else:
+        results_df = pd.DataFrame(results, columns=["Gender", "Inference", "Statistic", "P-Value"])
+
+    return results_df
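
A quick sketch of `compute_ttest` on synthetic scores (the values are random, purely to exercise the non-hallucination output shape):

```python
import numpy as np
import pandas as pd
from ttest import compute_ttest

rng = np.random.default_rng(0)
cols = ["per_pos", "per_for", "per_ac"]
df_m = pd.DataFrame(rng.uniform(size=(50, 3)), columns=cols)
df_f = pd.DataFrame(rng.uniform(size=(50, 3)), columns=cols)
print(compute_ttest(df_m, df_f))  # columns: Inference, Statistic, P-Value
```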