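"""Streamlit demo for exploring gender bias in LLM-generated reference letters,
following Wan et al. (2023): https://arxiv.org/abs/2310.09219.

Users upload two CSVs of generated letters (one for male and one for female
candidates) and run a lexical content or language style bias analysis. The
hallucination bias analysis is kept below as commented-out code because it
cannot run here due to memory constraints.
"""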
import streamlit as st
import pandas as pd
import numpy as np

from biases_lexical_content import compute_lexical_content
from ls_classifier import compute_sentiment_and_formality
from agentic_classifier import compute_agentic_communal
# from hallucination_detection import detect_hallucinations
from ttest import compute_ttest

st.header("LLM Reference Letter Biases")

st.write("**[(Wan et al., 2023)](https://arxiv.org/abs/2310.09219)** explores how gender biases manifest in LLM-generated reference letters by analyzing the language style and lexical content of letters generated for female candidates compared to male candidates. For language style, we test for formality, positivity, and agency; for lexical content, we identify and compare the most salient words in the bodies of female and male letters.")
st.write("For the language style and lexical content bias analyses, each uploaded file must have a column called **'text'** containing the LLM-generated reference letters.")
st.write("It is currently not possible to run the hallucination bias analysis here due to memory constraints. Please see the [GitHub repository](https://github.com/uclanlp/biases-llm-reference-letters/), which explains how to run it locally.")
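# Illustrative upload format (the example rows below are hypothetical): each
# CSV needs a 'text' column holding one generated letter per row, e.g.
#
#   text
#   "I am pleased to recommend Joseph, an exceptional colleague who..."
#   "Kelly has been a warm and dependable member of our team who..."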

cols = st.columns(2)

with cols[0]:
    ltr_list_1_file = st.file_uploader("Upload first list of letters (male)", type=["csv"])
    if ltr_list_1_file is not None:
        ltr_list_1 = pd.read_csv(ltr_list_1_file)
    ltr_list_2_file = st.file_uploader("Upload second list of letters (female)", type=["csv"])
    if ltr_list_2_file is not None:
        ltr_list_2 = pd.read_csv(ltr_list_2_file)
    analysis = st.selectbox("Choose analysis to run", ("Lexical Content Bias", "Language Style Bias"))
    run = st.button("Run analysis")
with cols[1]:
    if run:
        # Both files are required; stop early with a warning if either is missing.
        if ltr_list_1_file is None or ltr_list_2_file is None:
            st.warning("Please upload both lists of letters before running an analysis.")
            st.stop()
        if analysis == "Lexical Content Bias":
            # Compare the most salient words in the male vs. female letters.
            l1 = ltr_list_1['text'].tolist()
            l2 = ltr_list_2['text'].tolist()
            lex_bias = compute_lexical_content(l1, l2)
            st.table(lex_bias)
        elif analysis == "Language Style Bias":
            # Score each set of letters for sentiment/formality and for agentic
            # vs. communal language. List 1 is male and list 2 is female,
            # matching the uploader labels above.
            lsb_m = compute_agentic_communal(compute_sentiment_and_formality(ltr_list_1))
            lsb_f = compute_agentic_communal(compute_sentiment_and_formality(ltr_list_2))
            
            # Tag each row with its gender so the combined view stays distinguishable.
            lsb_m_copy = lsb_m.assign(gender='m')
            lsb_f_copy = lsb_f.assign(gender='f')

            lsb_both = pd.concat([lsb_m_copy, lsb_f_copy])
            
            tab1, tab2, tab3 = st.tabs(["List 1 (Male)", "List 2 (Female)", "Combined"])
            
            with tab1:
                st.write(lsb_m)
            with tab2:
                st.write(lsb_f)
            with tab3:
                st.write(lsb_both)

            st.subheader("T-test Values")
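            # Check whether the male and female style scores differ significantly.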
            results = compute_ttest(lsb_m, lsb_f)
            st.table(results)
        # elif analysis == "Hallucination Bias":
        #     hal_m = detect_hallucinations(ltr_list_1)
        #     hal_f = detect_hallucinations(ltr_list_2)

        #     # Once we've detected the hallucinations, we now want to run the language style bias analysis on the results.

        #     hal_lsb_f = compute_agentic_communal(compute_sentiment_and_formality(hal_f, hallucination=True), hallucination=True)
        #     hal_lsb_m = compute_agentic_communal(compute_sentiment_and_formality(hal_m, hallucination=True), hallucination=True)

        #     # Finally, ttest

        #     results = compute_ttest(hal_lsb_m, hal_lsb_f, hallucination=True)
        #     st.table(results)

st.write('----')

st.header("Model Comparison")
st.write("Check how your generated letters measure up against letters generated by ChatGPT and Alpaca.")

# Precomputed reference results for ChatGPT and Alpaca, used for comparison
# against the uploaded letters.

ls_columns = ['Formality', 'Positivity', 'Agency']
ls_gpt = [1.48, 5.93, 10.47]
ls_alpaca = [3.04, 1.47, 8.42]

lc_columns = ['Male Nouns', 'Male Adjectives', 'Female Nouns', 'Female Adjectives']
lc_gpt = ["man, father, ages, actor, thinking, colleague, flair, expert, adaptation, integrity",
          "respectful, broad, humble, past, generous, charming, proud, reputable, authentic, kind",
          "actress, mother, perform, beauty, trailblazer, force, woman, adaptability, delight, icon",
          "warm, emotional, indelible, unnoticed, weekly, stunning, multi, environmental, contemporary, amazing"]
lc_alpaca = ['actor, listeners, fellowship, man, entertainer, needs, collection, thinker, knack, master',
             'classic, motivated, reliable, non, punctual, biggest, political, orange, prolific, dependable',
             'actress, grace, consummate, chops, none, beauty, game, consideration, future, up',
             'impeccable, beautiful, inspiring, illustrious, organizational, prepared, responsible, highest, ready, remarkable']

# hal_columns = ['(F) Formality T-test', '(M) Formality T-test', '(F) Positivity T-test', '(M) Positivity T-test',
#                '(F) Agency T-test', '(M) Agency T-test']
# hal_gpt = [1.00, 1.28e-14, 1.00, 8.28e-09, 3.05e-12, 1.00]
# hal_alpaca = [4.20e-180, 1.00, 0.99, 6.05e-11, 4.28e-10, 1.00]

tab_lc, tab_ls = st.tabs(['Lexical Content', 'Language Style'])

with tab_lc:
    lc_df = pd.DataFrame([lc_gpt, lc_alpaca], columns=lc_columns, index=['ChatGPT','Alpaca'])
    st.table(lc_df)
with tab_ls:
    ls_df = pd.DataFrame([ls_gpt, ls_alpaca], columns=ls_columns, index=['ChatGPT','Alpaca'])
    st.dataframe(ls_df)
# with tab_hal:
#     hal_df = pd.DataFrame([hal_gpt, hal_alpaca], columns = hal_columns, index=['ChatGPT','Alpaca'])
#     st.dataframe(hal_df)

st.write('----')

st.header("Citation")
cit = '''@misc{wan2023kelly,
      title={"Kelly is a Warm Person, Joseph is a Role Model": Gender Biases in LLM-Generated Reference Letters}, 
      author={Yixin Wan and George Pu and Jiao Sun and Aparna Garimella and Kai-Wei Chang and Nanyun Peng},
      year={2023},
      eprint={2310.09219},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
'''
st.code(cit)
st.write("The [repository](https://github.com/uclanlp/biases-llm-reference-letters) and the [paper](https://arxiv.org/abs/2310.09219) are linked here as well.")