File size: 4,779 Bytes
a269338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import spacy
from spacy.matcher import Matcher
from collections import Counter
from operator import itemgetter
import pandas as pd
from tqdm import tqdm
import scipy.stats as stats
from argparse import ArgumentParser

def calculate_dict(female_array, male_array):
    """Count word occurrences for two groups over one shared vocabulary.

    Each input is tallied with ``Counter``.  Any word that occurs in only one
    of the groups is then stored with an explicit count of 0 in the other, so
    both returned counters contain exactly the same keys.

    Args:
        female_array: Iterable of hashable items (words) for the female group.
        male_array: Iterable of hashable items (words) for the male group.

    Returns:
        A ``(female_counter, male_counter)`` tuple of ``Counter`` objects.
    """
    female_counts, male_counts = Counter(female_array), Counter(male_array)

    # Store the zero entries explicitly so both counters end up with the same
    # key set (and therefore the same length), not just the same lookups.
    for source, target in ((female_counts, male_counts), (male_counts, female_counts)):
        for word in set(source) - set(target):
            target[word] = 0

    return female_counts, male_counts

def odds_ratio(f_dict, m_dict, topk=50, threshold=20):
    """Rank words by the male-to-female odds ratio of their counts.

    For every word that occurs at least ``threshold`` times in BOTH groups the
    score is ``(m_num / f_num) / (non_m_num / non_f_num)``, rounded to two
    decimals, where ``non_*_num`` is the group's total count minus the word's
    own count.  A score above 1 means the word is relatively more frequent in
    ``m_dict``; a score below 1 means it is relatively more frequent in
    ``f_dict``.

    Args:
        f_dict: Mapping of word -> count for the female group.
        m_dict: Mapping of word -> count for the male group.  Must have the
            same number of keys as ``f_dict``.
        topk: Maximum number of entries kept in each returned ranking.
        threshold: Minimum count a word needs in both groups to be scored.

    Returns:
        A ``(male_top, female_top)`` tuple of dicts mapping word -> score.
        ``male_top`` is ordered from the highest score down, ``female_top``
        from the lowest score up; each holds at most ``topk`` entries.

    Raises:
        Exception: If the two mappings do not have the same number of keys.
    """
    # Stand-in for a zero count so that a word which makes up all of a group's
    # occurrences (or a non-positive threshold) cannot cause a division by zero.
    very_small_value = 0.00001

    if len(f_dict) != len(m_dict):
        raise Exception('The category for analyzing the male and female should be the same!')

    def _nonzero(count):
        return count if count else very_small_value

    total_num_f = sum(f_dict.values())
    total_num_m = sum(m_dict.values())

    ratios = {}
    for key, f_num in f_dict.items():
        m_num = m_dict[key]
        # Only score events with at least `threshold` occurrences for both genders.
        if f_num < threshold or m_num < threshold:
            continue
        non_f_num = total_num_f - f_num
        non_m_num = total_num_m - m_num
        ratios[key] = round(
            (m_num / _nonzero(f_num)) / (_nonzero(non_m_num) / _nonzero(non_f_num)),
            2,
        )

    # Two independent stable sorts so that ties keep the same relative order
    # in both rankings as they had in `f_dict`.
    descending = sorted(ratios.items(), key=itemgetter(1), reverse=True)
    ascending = sorted(ratios.items(), key=itemgetter(1))
    return dict(descending[:topk]), dict(ascending[:topk])

class Word_Extraction:
    """Extract single tokens of selected parts of speech from raw text.

    Wraps the spaCy ``en_core_web_sm`` pipeline together with a ``Matcher``
    that holds one single-token pattern per requested word type.
    """

    # Supported `word_types` names and the POS value each one matches.
    _POS_BY_WORD_TYPE = {'noun': 'NOUN', 'adj': 'ADJ', 'verb': 'VERB'}

    def __init__(self, word_types=None):
        """Load the spaCy model and register the POS patterns.

        Args:
            word_types: Iterable containing any of ``'noun'``, ``'adj'`` and
                ``'verb'``.  Unrecognised names are ignored.  ``None`` (the
                default) is treated as an empty selection instead of failing
                while iterating.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)
        patterns = self._build_patterns(word_types)
        # With nothing to match, leave the matcher empty rather than
        # registering a rule that has no patterns.
        if patterns:
            self.matcher.add("demo", patterns)

    @classmethod
    def _build_patterns(cls, word_types):
        """Return one single-token ``{'POS': ...}`` pattern per known word type."""
        if word_types is None:
            return []
        return [
            [{'POS': cls._POS_BY_WORD_TYPE[word_type]}]
            for word_type in word_types
            if word_type in cls._POS_BY_WORD_TYPE
        ]

    def extract_word(self, doc):
        """Return the text of every span matched in ``doc``.

        Args:
            doc: Raw text; it is passed through ``self.nlp`` before matching.

        Returns:
            List of matched span texts, in the order the matcher yields them.
        """
        doc = self.nlp(doc)
        return [doc[start:end].text for _match_id, start, end in self.matcher(doc)]
    
def _clean_token(raw):
    """Normalise one matched span into the form used for counting."""
    return raw.split()[0].replace('<return>', '').replace('<return', '').strip(',./?').lower()


def _collect_words(texts, extractors):
    """Return one list of cleaned words per extractor, gathered over all texts."""
    collected = [[] for _ in extractors]
    # Iterate the texts directly (not by position) so any iterable with a
    # length works, whatever its indexing scheme.
    for text in tqdm(texts, ascii=True):
        for extractor, bucket in zip(extractors, collected):
            bucket.extend(
                _clean_token(span)
                for span in extractor.extract_word(text)
                if span.strip()  # a whitespace-only span has no first piece to take
            )
    return collected


def compute_lexical_content(list1, list2, threshold=10):
    """Summarise the nouns and adjectives most skewed towards each group.

    Nouns and adjectives are extracted from every text, counted per group and
    ranked with ``odds_ratio``; the ten most skewed words in each direction
    are reported.

    Args:
        list1: Texts for the female group.
        list2: Texts for the male group.
        threshold: Minimum number of occurrences a word needs in both groups
            to be ranked (forwarded to ``odds_ratio``).

    Returns:
        A ``pandas.DataFrame`` indexed by ``'noun'`` and ``'adj'`` with the
        columns ``'male'`` and ``'female'``.  Each cell is a ``", "``-joined
        string of up to ten words; it is empty when no word was ranked.
    """
    extractors = (Word_Extraction(['noun']), Word_Extraction(['adj']))
    noun_f, adj_f = _collect_words(list1, extractors)
    noun_m, adj_m = _collect_words(list2, extractors)

    # odds_ratio returns the male-skewed ranking first, the female-skewed one second.
    noun_counter_f, noun_counter_m = calculate_dict(noun_f, noun_m)
    noun_res_m, noun_res_f = odds_ratio(noun_counter_f, noun_counter_m, threshold=threshold)
    adj_counter_f, adj_counter_m = calculate_dict(adj_f, adj_m)
    adj_res_m, adj_res_f = odds_ratio(adj_counter_f, adj_counter_m, threshold=threshold)

    def _top_words(ranking):
        return ", ".join(list(ranking)[:10])

    data = {
        'male': [_top_words(noun_res_m), _top_words(adj_res_m)],
        'female': [_top_words(noun_res_f), _top_words(adj_res_f)],
    }
    return pd.DataFrame(data, index=['noun', 'adj'])