File size: 18,113 Bytes
6f4f21f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
import time

import pandas as pd
import streamlit as st 
import matplotlib.pyplot as plt

from data_utils.data_simulation import UpliftSimulationReady
from data_utils.eda_simulation import EDASimulationReady
from data_utils.feature_importance_simulation import FISimulationReady
from models_utils.models_simulation import CATESimulationReady
from eval_utils.evaluation_simulation import CATEConversionEvaluateSimulationReady, CATEBenefitEvaluateSimulationReady

from mlops_utils.wandb_utils import upload_dataset_to_wandb, eda_work_with_dataset_to_wandb, training_results_to_wandb

# Page header plus sidebar navigation. `tabs` receives the value returned by
# the sidebar radio widget and is compared against the page names by the
# `if tabs == ...` sections of this script.
_pages = ["Data", "EDA", "Modeling", "Effect"]
st.title('Causal Uplift Modeling')
tabs = st.sidebar.radio("Navigation", _pages)

if tabs == "Data":

    # Load the four raw client files first, so any loader failure surfaces
    # before anything is drawn on the page.
    uplift_simulation = UpliftSimulationReady('./data/raw_data_client/')
    user_profiles = uplift_simulation.load_user_profiles('user_profiles.csv')
    uplift_data = uplift_simulation.load_uplift_data('uplift_data.csv')
    irrelevant_data = uplift_simulation.load_irrelevant_data('irrelevant_data.csv')
    transaction_data = uplift_simulation.load_other_data('transaction_data.csv')

    st.subheader('Loading data')

    # (caption, dataset) pairs, rendered in this order as a caption followed
    # by the result of `.head(5)` on the dataset.
    previews = (
        ('User profiles', user_profiles),
        ('Uplift data', uplift_data),
        ('Other data', irrelevant_data),
        ('Transaction data', transaction_data),
    )
    for caption, dataset in previews:
        st.write(caption)
        st.write(dataset.head(5))

    # The upload only runs on the rerun triggered by clicking the button.
    upload_requested = st.button('Upload data to wandb')
    if upload_requested:
        upload_dataset_to_wandb(['./data/raw_data_client'], 'nl_cate_modeling', 'uplift_data')
        st.write('Data uploaded to wandb')

    # TODO: add to WANDB data processing step in the beginning
    # TODO: the tree of updates
    # TODO: choose the version from MLOps here exactly

if tabs == "EDA":
    # EDA page: shows the aggregated conversion tables, two scatter plots
    # (mean values and ATE relative to the control group) and the feature
    # importance bar chart, plus a button that uploads the EDA data to wandb.

    eda_simulation = EDASimulationReady('./data/processed_data/')
    sum_conversions, mean_conversions = eda_simulation.load_conversions('uplift_classification_processed.csv')

    st.subheader('Exploratory Data Analysis')

    st.write('We can begin by computing the total sum of conversions, sales (discounted price) and platform benefit. We can see that the total conversions and the total sales grows as the discount value is bigger. However the platform benefit decreases.')
    st.write(sum_conversions)

    st.write('We can repeat the analysis but using the mean instead of the sum. This will give us the mean conversion rate, the mean sales per user and the mean platform benefit per user.')
    st.write(mean_conversions)

    st.write('To illustrate the tradeoff between conversions and platform benefit we can plot the mean benefit per user in the y-axis and the mean conversion rate in the x-axis, per treatment group.')

    # Keep only the 'mean' column group and drop the outer column level so the
    # remaining columns can be addressed by their plain names.
    df_pivot_mean = mean_conversions[['mean']]
    df_pivot_mean.columns = df_pivot_mean.columns.droplevel()

    fig, ax = plt.subplots()
    df_pivot_mean.plot.scatter(x='conversion',
                               y='benefit',
                               c='DarkBlue',
                               s=50,
                               ax=ax)
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; the
    # script is re-executed on each interaction, so close it once rendered.
    plt.close(fig)

    st.write('''
             We can also compute the Average Treatment Effect (ATE) for both the mean conversion rate and the mean benefit per user:
            Conversion ATE = Mean Conversion rate in discounted group minus Mean Conversion rate in control group
            Benefit ATE = Mean Benefit per user in discounted group minus Mean Benefit per user in control group
            We can see in the plot below that the bigger the discount value the stronger the Conversion ATE (x-axis), but at the same time the more negative the Benefit ATE (y-axis).
             ''')

    # Subtract the control row from every row to get per-group ATE values.
    control_means = df_pivot_mean.loc['control'].values.squeeze()
    # Derive the '*_ate' names from the real column names instead of assigning
    # a hard-coded list by position, so a different column order can never
    # swap the axes of the plot below.
    df_pivot_mean_ate = (df_pivot_mean - control_means).add_suffix('_ate')

    fig, ax = plt.subplots()
    df_pivot_mean_ate.plot.scatter(x='conversion_ate',
                                   y='benefit_ate',
                                   c='DarkBlue',
                                   s=50,
                                   ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    st.subheader('Feature Importance')

    fi = FISimulationReady('./data/eda_data/')
    di_df = fi.load_feature_importance('kl_feature_importance.csv')

    st.write('Feature importance')
    fig, ax = plt.subplots()
    di_df_sorted = di_df.sort_values(by='score', ascending=False)
    di_df_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    if st.button('Upload EDA to wandb'):
        eda_work_with_dataset_to_wandb(
            dirs=['./data/eda_data/'],
            project_name='nl_cate_modeling',
            dataset_name='uplift_data:latest',
            dataset_type='raw_dataset',
            artifact_type='eda')
        st.write('EDA uploaded to wandb')

    # TODO: add report to WANDB
    # TODO: add artifacts to WANDB

# (label shown on the page, two-digit tag used in the CSV file names).
# Raw strings keep the literal backslash the labels have always contained
# while avoiding the invalid "\%" escape sequence of a normal string literal.
_DISCOUNT_GROUPS = (
    (r'5\% discount group', '05'),
    (r'10\% discount group', '10'),
    (r'15\% discount group', '15'),
)


def _plot_feature_importance(label, importance):
    """Write `label` and draw a horizontal bar chart of feature scores.

    `importance` must support `sort_values(by='score')` and expose 'feature'
    and 'score' columns. The figure is closed after rendering so repeated
    script reruns do not accumulate open matplotlib figures.
    """
    st.write(label)
    fig, ax = plt.subplots()
    ranked = importance.sort_values(by='score', ascending=False)
    ranked[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
    st.pyplot(fig)
    plt.close(fig)


def _run_cate_model_demo(file_suffix=''):
    """Show the simulated training run and per-discount feature importance.

    `file_suffix` is inserted before '.csv' in the feature-importance file
    names ('' for the conversion model, '_bate' for the benefit model).
    """
    # Fake training: a 2 second spinner stands in for the real fit.
    with st.spinner('Training model...'):
        time.sleep(2)

    st.subheader('Feature importance by discount group')

    # NOTE(review): both the conversion and the benefit button load the same
    # model.pkl / y_pred.pkl pair — confirm that this is intended.
    model = CATESimulationReady('./data/models_data/model.pkl', './data/models_data/y_pred.pkl')
    # The return value is not used on this page; the call is kept in case it
    # has side effects.
    model.predict()

    # Load all three tables before drawing anything, as the page always did.
    loaded = [
        (label, model.feature_importance(
            './data/models_data/discount_{}_feature_importance{}.csv'.format(tag, file_suffix)))
        for label, tag in _DISCOUNT_GROUPS
    ]
    for label, importance in loaded:
        _plot_feature_importance(label, importance)


if tabs == "Modeling":
    # Modeling page: two simulated training runs (conversion and benefit),
    # each with its own button to upload ./data/models_data to wandb.

    st.subheader('Causal ML modeling')

    st.write('We can begin by modeling the Conditional Average Treatment Effect')
    if st.button('Train & run CATE conversion model'):
        _run_cate_model_demo()
    if st.button('Upload conversion model to wandb'):
        training_results_to_wandb(['./data/models_data'],
                                  'nl_cate_modeling',
                                  'uplift_data:latest',
                                  'raw_dataset',
                                  'model_artifacts',
                                  'causal_model_conversion')
        st.write('Models uploaded to wandb')

    st.write('Similarly we can now train a T-Learner on the benefit label, and use the model predictions to evaluate the performance on the CATE conversion and CATE benefit.')
    if st.button('Train & run CATE benefit model'):
        _run_cate_model_demo('_bate')
    if st.button('Upload benefit model to wandb'):
        training_results_to_wandb(['./data/models_data'],
                                  'nl_cate_modeling',
                                  'uplift_data:latest',
                                  'raw_dataset',
                                  'model_artifacts',
                                  'causal_model_benefit')
        st.write('Models uploaded to wandb')

# (discount passed to evaluate(), legend prefix, matplotlib colour) for the
# three discount groups, in plotting order.
_QINI_GROUPS = (
    (5, '5%', 'b'),
    (10, '10%', 'g'),
    (15, '15%', 'y'),
)


def _plot_qini_curves(curves, y_label, title):
    """Plot 'Random' and 'S' against 'index' for each discount group.

    `curves` holds one frame per entry of `_QINI_GROUPS`, in the same order.
    All random baselines are drawn first (thin dashed lines), then the model
    curves, so the legend keeps that order.
    """
    fig, ax = plt.subplots()
    for frame, (_, prefix, colour) in zip(curves, _QINI_GROUPS):
        frame.plot(ax=ax, x='index', y='Random', color=colour, ls='--', lw=0.5, label=prefix + ' random')
    for frame, (_, prefix, colour) in zip(curves, _QINI_GROUPS):
        frame.plot(ax=ax, x='index', y='S', color=colour, label=prefix + ' model')
    ax.legend()
    ax.set_xlabel('Fraction of Targeted Users')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    st.pyplot(fig)
    # Close the figure so reruns of the script do not pile up open figures.
    plt.close(fig)


def _plot_benefit_vs_conversion(conversion_curves, benefit_curves):
    """Plot the benefit 'S' column against the conversion 'S' column."""
    fig, ax = plt.subplots()
    for conversion, benefit, (_, _, colour) in zip(conversion_curves, benefit_curves, _QINI_GROUPS):
        paired = pd.concat([conversion[['S']], benefit[['S']]], axis=1)
        paired.columns = ['cate_conversion', 'cate_benefit']
        paired.plot(ax=ax, x='cate_conversion', y='cate_benefit', color=colour)
    ax.legend(['5% model', '10% model', '15% model'], prop={'size': 10})
    ax.set_xlabel('CATE Conversion')
    ax.set_ylabel('CATE Benefit')
    ax.set_title('CATE benefit vs CATE conversion')
    st.pyplot(fig)
    plt.close(fig)


def _render_effect_column(heading, evaluator_cls, button_label, model_artifact, evaluation_name):
    """Render one model's evaluation plots and its wandb upload button.

    `evaluator_cls` is instantiated with './data/effect_data/' and its
    `evaluate(discount)` is expected to return a (conversion, benefit) pair of
    frames. `model_artifact` and `evaluation_name` are forwarded unchanged to
    `training_results_to_wandb`.
    """
    st.write(heading)

    # Named `evaluator` rather than `eval` to avoid shadowing the builtin.
    evaluator = evaluator_cls('./data/effect_data/')
    conversion_curves = []
    benefit_curves = []
    for discount, _, _ in _QINI_GROUPS:
        conversion, benefit = evaluator.evaluate(discount)
        conversion_curves.append(conversion)
        benefit_curves.append(benefit)

    _plot_qini_curves(conversion_curves, 'CATE conversion', 'CATE conversion vs Targeted Population')
    _plot_qini_curves(benefit_curves, 'CATE Benefit', 'CATE benefit vs Targeted Population')
    _plot_benefit_vs_conversion(conversion_curves, benefit_curves)

    if st.button(button_label):
        training_results_to_wandb(['./data/effect_data'],
                                  'nl_cate_modeling',
                                  model_artifact,
                                  'model_artifacts',
                                  'effects_artifacts',
                                  evaluation_name,
                                  job_type='evaluation')
        st.write('Evaluation uploaded to wandb')


if tabs == "Effect":
    # Effect page: the same set of evaluation plots for the conversion model
    # (left column) and the benefit model (right column).

    st.subheader('Causal ML evaluation')
    st.write('We can evaluate our models by looking at the Qini curves. We can use the CATE conversion model to evaluate the performance on both the Conversion and the Benefit as a function of the fraction of users targeted.')

    col1, col2 = st.columns(2)

    with col1:
        # NOTE(review): 'convesion' looks like a typo of 'conversion'. It is
        # left unchanged because the value is sent to wandb and existing runs
        # may already use it — confirm before renaming.
        _render_effect_column('CATE conversion model',
                              CATEConversionEvaluateSimulationReady,
                              'Upload conversion effects to wandb',
                              'causal_model_conversion:latest',
                              'convesion_model_evaluation')

    with col2:
        _render_effect_column('CATE benefit model',
                              CATEBenefitEvaluateSimulationReady,
                              'Upload benefit effects to wandb',
                              'causal_model_benefit:latest',
                              'benefit_model_evaluation')

    st.write('To simplify the comparison, we can plot the CATE Benefit as a function of the CATE conversion.')
    st.write('In the last plot for example we can see that there is a region where offering 15% discount to a targeted group of users is more efficient than giving 10% to everyone. We can obtain the same impact in overall conversion uplift while reducing our benefit loss considerably.')