Spaces:

neuronslabs
/

uplift_modeling

Sleeping

File size: 18,113 Bytes

6f4f21f

import time

import pandas as pd
import streamlit as st 
import matplotlib.pyplot as plt

from data_utils.data_simulation import UpliftSimulationReady
from data_utils.eda_simulation import EDASimulationReady
from data_utils.feature_importance_simulation import FISimulationReady
from models_utils.models_simulation import CATESimulationReady
from eval_utils.evaluation_simulation import CATEConversionEvaluateSimulationReady, CATEBenefitEvaluateSimulationReady

from mlops_utils.wandb_utils import upload_dataset_to_wandb, eda_work_with_dataset_to_wandb, training_results_to_wandb

# Page header and sidebar selector. `tabs` holds the label of the section the
# user picked; the `if tabs == ...` branches below render the matching page.
nav_sections = ["Data", "EDA", "Modeling", "Effect"]
st.title('Causal Uplift Modeling')
tabs = st.sidebar.radio("Navigation", nav_sections)

if tabs == "Data":

    # Load all four raw client tables first, so the page header is only
    # rendered once every file has been read.
    raw_loader = UpliftSimulationReady('./data/raw_data_client/')
    user_profiles = raw_loader.load_user_profiles('user_profiles.csv')
    uplift_data = raw_loader.load_uplift_data('uplift_data.csv')
    irrelevant_data = raw_loader.load_irrelevant_data('irrelevant_data.csv')
    transaction_data = raw_loader.load_other_data('transaction_data.csv')

    st.subheader('Loading data')

    # Show a caption followed by the first five rows of each table.
    previews = (
        ('User profiles', user_profiles),
        ('Uplift data', uplift_data),
        ('Other data', irrelevant_data),
        ('Transaction data', transaction_data),
    )
    for caption, table in previews:
        st.write(caption)
        st.write(table.head(5))

    # The upload only runs on the rerun triggered by a button click.
    upload_clicked = st.button('Upload data to wandb')
    if upload_clicked:
        upload_dataset_to_wandb(['./data/raw_data_client'], 'nl_cate_modeling', 'uplift_data')
        st.write('Data uploaded to wandb')

    # TODO: add to WANDB data processing step in the beginning
    # TODO: the tree of updates
    # TODO: choose the version from MLOps here exactly

if tabs == "EDA":

    # Pre-computed conversion summaries; load_conversions() is unpacked as
    # (sum table, mean table).
    eda_simulation = EDASimulationReady('./data/processed_data/')
    sum_conversions, mean_conversions = eda_simulation.load_conversions('uplift_classification_processed.csv')

    st.subheader('Exploratory Data Analysis')

    st.write('We can begin by computing the total sum of conversions, sales (discounted price) and platform benefit. We can see that the total conversions and the total sales grows as the discount value is bigger. However the platform benefit decreases.')
    st.write(sum_conversions)

    st.write('We can repeat the analysis but using the mean instead of the sum. This will give us the mean conversion rate, the mean sales per user and the mean platform benefit per user.')
    st.write(mean_conversions)

    st.write('To illustrate the tradeoff between conversions and platform benefit we can plot the mean benefit per user in the y-axis and the mean conversion rate in the x-axis, per treatment group.')

    # Keep only the 'mean' column group and drop that outer column level so
    # the metrics can be addressed by their plain names.
    df_pivot_mean = mean_conversions[['mean']]
    df_pivot_mean.columns = df_pivot_mean.columns.droplevel()

    fig, ax = plt.subplots()
    df_pivot_mean.plot.scatter(x='conversion',
                               y='benefit',
                               c='DarkBlue',
                               s=50,
                               ax=ax)
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates; close it after
    # rendering so figures do not pile up across Streamlit reruns.
    plt.close(fig)

    st.write('''
             We can also compute the Average Treatment Effect (ATE) for both the mean conversion rate and the mean benefit per user:
            Conversion ATE = Mean Conversion rate in discounted group minus Mean Conversion rate in control group
            Benefit ATE = Mean Benefit per user in discounted group minus Mean Benefit per user in control group
            We can see in the plot below that the bigger the discount value the stronger the Conversion ATE (x-axis), but at the same time the more negative the Benefit ATE (y-axis).
             ''')

    # ATE = each row's means minus the 'control' row's means.
    df_pivot_mean_ate = df_pivot_mean - df_pivot_mean.loc['control'].values.squeeze()
    # Derive the '*_ate' names from the existing column names instead of
    # assigning a positional list, so a label can never end up on the wrong
    # metric if the column order or count differs.
    df_pivot_mean_ate = df_pivot_mean_ate.add_suffix('_ate')

    fig, ax = plt.subplots()
    df_pivot_mean_ate.plot.scatter(x='conversion_ate',
                                   y='benefit_ate',
                                   c='DarkBlue',
                                   s=50,
                                   ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    st.subheader('Feature Importance')

    fi = FISimulationReady('./data/eda_data/')
    di_df = fi.load_feature_importance('kl_feature_importance.csv')

    st.write('Feature importance')
    fig, ax = plt.subplots()
    di_df_sorted = di_df.sort_values(by='score', ascending=False)
    di_df_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    # The upload only runs on the rerun triggered by a button click.
    if st.button('Upload EDA to wandb'):
        eda_work_with_dataset_to_wandb(
            dirs=['./data/eda_data/'],
            project_name='nl_cate_modeling',
            dataset_name='uplift_data:latest',
            dataset_type='raw_dataset',
            artifact_type='eda')
        st.write('EDA uploaded to wandb')

    # TODO: add report to WANDB
    # TODO: add artifacts to WANDB

if tabs == "Modeling":

    # Directory holding the saved model, its predictions and the per-discount
    # feature-importance CSVs shown on this page.
    models_dir = './data/models_data'

    # (file-name code, caption) per discount group. The backslash is spelled
    # '\\' so the displayed text is unchanged without relying on the invalid
    # escape sequence '\%'.
    discount_groups = (
        ('05', '5\\% discount group'),
        ('10', '10\\% discount group'),
        ('15', '15\\% discount group'),
    )

    def _render_cate_model_section(train_label, fi_suffix, upload_label, wandb_model_name):
        """Render one model's "train" button, its feature importances and its upload button.

        train_label: caption of the button that triggers the simulated training.
        fi_suffix: text inserted before '.csv' in the feature-importance file
            names ('' or '_bate').
        upload_label: caption of the button that uploads the model directory.
        wandb_model_name: last positional argument passed to
            training_results_to_wandb.
        """
        if st.button(train_label):
            # Training is simulated: show a spinner for 2 seconds, then read
            # the saved artifacts from disk.
            with st.spinner('Training model...'):
                time.sleep(2)

            st.subheader('Feature importance by discount group')

            model = CATESimulationReady(f'{models_dir}/model.pkl', f'{models_dir}/y_pred.pkl')
            # The predictions are not displayed on this page; the call itself
            # is kept. NOTE(review): drop it if predict() has no side effects.
            model.predict()

            # Read all three tables before plotting, so a missing file fails
            # before any chart is drawn.
            importances = [
                (caption,
                 model.feature_importance(
                     f'{models_dir}/discount_{code}_feature_importance{fi_suffix}.csv'))
                for code, caption in discount_groups
            ]

            for caption, importance in importances:
                st.write(caption)
                fig, ax = plt.subplots()
                ranked = importance.sort_values(by='score', ascending=False)
                ranked[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
                st.pyplot(fig)
                # Close the figure after rendering so figures do not pile up
                # in pyplot's registry across Streamlit reruns.
                plt.close(fig)

        if st.button(upload_label):
            training_results_to_wandb([models_dir],
                                      'nl_cate_modeling',
                                      'uplift_data:latest',
                                      'raw_dataset',
                                      'model_artifacts',
                                      wandb_model_name)
            st.write('Models uploaded to wandb')

    st.subheader('Causal ML modeling')

    st.write('We can begin by modeling the Conditional Average Treatment Effect')
    _render_cate_model_section('Train & run CATE conversion model',
                               '',
                               'Upload conversion model to wandb',
                               'causal_model_conversion')

    st.write('Similarly we can now train a T-Learner on the benefit label, and use the model predictions to evaluate the performance on the CATE conversion and CATE benefit.')
    _render_cate_model_section('Train & run CATE benefit model',
                               '_bate',
                               'Upload benefit model to wandb',
                               'causal_model_benefit')

if tabs == "Effect":

    # (discount percentage, matplotlib colour) for every evaluated discount.
    discount_styles = ((5, 'b'), (10, 'g'), (15, 'y'))

    def _show_qini_curves(curves, ylabel, title):
        """Plot the 'Random' and 'S' columns of each curve against 'index' on one figure.

        curves: list of (discount, colour, frame) tuples.
        All 'Random' baselines are drawn before the 'S' curves so the legend
        lists the three baselines first and the three model curves after.
        """
        fig, ax = plt.subplots()
        for discount, color, frame in curves:
            frame.plot(ax=ax, x='index', y='Random', color=color, ls='--', lw=0.5,
                       label=f'{discount}% random')
        for discount, color, frame in curves:
            frame.plot(ax=ax, x='index', y='S', color=color, label=f'{discount}% model')
        ax.legend()
        ax.set_xlabel('Fraction of Targeted Users')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        st.pyplot(fig)
        # Close after rendering so figures do not pile up in pyplot's registry
        # across Streamlit reruns.
        plt.close(fig)

    def _show_benefit_vs_conversion(results):
        """Plot each discount's benefit 'S' column against its conversion 'S' column.

        results: list of (discount, colour, conversion frame, benefit frame).
        """
        fig, ax = plt.subplots()
        for _, color, conversion_curve, benefit_curve in results:
            combined = pd.concat([conversion_curve[['S']], benefit_curve[['S']]], axis=1)
            combined.columns = ['cate_conversion', 'cate_benefit']
            combined.plot(ax=ax, x='cate_conversion', y='cate_benefit', color=color)
        ax.legend([f'{discount}% model' for discount, *_ in results], prop={'size': 10})
        ax.set_xlabel('CATE Conversion')
        ax.set_ylabel('CATE Benefit')
        ax.set_title('CATE benefit vs CATE conversion')
        st.pyplot(fig)
        plt.close(fig)

    def _render_model_column(heading, evaluator_cls, upload_label, wandb_model, wandb_run_name):
        """Render the three evaluation plots and the upload button for one model.

        Must be called inside the target `with column:` context.
        """
        st.write(heading)

        # Named `evaluator` rather than `eval` to avoid shadowing the builtin.
        evaluator = evaluator_cls('./data/effect_data/')

        # evaluate() is unpacked as (conversion curve, benefit curve) for the
        # given discount percentage.
        results = []
        for discount, color in discount_styles:
            conversion_curve, benefit_curve = evaluator.evaluate(discount)
            results.append((discount, color, conversion_curve, benefit_curve))

        _show_qini_curves([(d, c, conversion) for d, c, conversion, _ in results],
                          'CATE conversion',
                          'CATE conversion vs Targeted Population')
        _show_qini_curves([(d, c, benefit) for d, c, _, benefit in results],
                          'CATE Benefit',
                          'CATE benefit vs Targeted Population')
        _show_benefit_vs_conversion(results)

        if st.button(upload_label):
            training_results_to_wandb(['./data/effect_data'],
                                      'nl_cate_modeling',
                                      wandb_model,
                                      'model_artifacts',
                                      'effects_artifacts',
                                      wandb_run_name,
                                      job_type='evaluation')
            st.write('Evaluation uploaded to wandb')

    st.subheader('Causal ML evaluation')
    st.write('We can evaluate our models by looking at the Qini curves. We can use the CATE conversion model to evaluate the performance on both the Conversion and the Benefit as a function of the fraction of users targeted.')

    # Side-by-side comparison: conversion model on the left, benefit model on the right.
    col1, col2 = st.columns(2)

    with col1:
        # NOTE(review): 'convesion_model_evaluation' looks like a typo, but it
        # is an identifier sent to wandb, so it is left as-is here; rename it
        # only together with anything that references it.
        _render_model_column('CATE conversion model',
                             CATEConversionEvaluateSimulationReady,
                             'Upload conversion effects to wandb',
                             'causal_model_conversion:latest',
                             'convesion_model_evaluation')

    with col2:
        _render_model_column('CATE benefit model',
                             CATEBenefitEvaluateSimulationReady,
                             'Upload benefit effects to wandb',
                             'causal_model_benefit:latest',
                             'benefit_model_evaluation')

    st.write('To simplify the comparison, we can plot the CATE Benefit as a function of the CATE conversion.')
    st.write('In the last plot for example we can see that there is a region where offering 15% discount to a targeted group of users is more efficient than giving 10% to everyone. We can obtain the same impact in overall conversion uplift while reducing our benefit loss considerably.')