''' Retrieving information from the predictor insight table '''

# NOTE(review): `pig_table` and `basetable` are never created in this file;
# they are presumably pandas DataFrames loaded before these statements run.
# pandas is used throughout this part of the script but was never imported.
import pandas as pd

# Inspect the predictor insight graph table of Country
print(pig_table)
'''
  Country   Size  Incidence
0   India  49849       0.05
1      UK  10057       0.05
2     USA  40094       0.05
'''

# Print the number of UK donors
print(pig_table.loc[pig_table["Country"] == "UK", "Size"])

# Check the target incidence of USA and India donors
print(pig_table.loc[pig_table["Country"] == "USA", "Incidence"])
print(pig_table.loc[pig_table["Country"] == "India", "Incidence"])

# =============================================================================
# Discretization of continuous variables
# =============================================================================

''' Discretization of a certain variable '''

print(basetable)
'''
       Unnamed: 0  target  time_since_last_donation
0               0       1                       808
1               1       1                       977
2               2       1                       641
...           ...     ...                       ...
99997       99997       0                       703
99998       99998       0                       682
99999       99999       0                       666
'''

# Discretize the variable time_since_last_donation in 10 bins
basetable["bins_recency"] = pd.qcut(basetable["time_since_last_donation"], 10)
print(basetable["bins_recency"])
'''
0         (738, 833]
1        (933, 1050]
2         (574, 657]
...
99997     (657, 738]
99998     (657, 738]
99999     (657, 738]
Categories (10, object): [[32, 319] < (319, 462] < (462, 574] < (574, 657] ... (833, 933] < (933, 1050] < (1050, 1209] < (1209, 2518]]
'''

# Print the group sizes of the discretized variable
print(basetable.groupby("bins_recency").size())
'''
[32, 319]       10058
(319, 462]       9953
(462, 574]       9999
...
(933, 1050]     10009
(1050, 1209]    10004
(1209, 2518]     9949
'''

''' Discretizing all variables '''

# Get all the variable names except "target"
variables = [name for name in basetable.columns if name != "target"]

# Loop through all the variables and discretize in 10 bins if there are more
# than 5 different values. Only numeric columns are considered: pd.qcut cannot
# bin text or already-binned (categorical) columns such as "bins_recency".
for variable in variables:
    column = basetable[variable]
    if pd.api.types.is_numeric_dtype(column) and column.nunique() > 5:
        new_variable = "disc_" + variable
        basetable[new_variable] = pd.qcut(column, 10)

''' Making clean cuts '''

# Discretize the variable
basetable["disc_number_gift"] = pd.cut(basetable["number_gift"], [0, 5, 10, 20])

# Count the number of observations per group
print(basetable.groupby("disc_number_gift").size())
'''
(0, 5]      55063
(5, 10]     41120
(10, 20]     3817
'''

''' Calculating average incidences '''

# Select the income and target columns
basetable_income = basetable[["target", "income"]]
print(basetable_income)
'''
       target   income
0           1     high
1           1  average
2           1     high
...       ...      ...
99997       0  average
99998       0      low
99999       0      low
'''

# Group basetable_income by income
groups = basetable_income.groupby("income")

# Calculate the target incidence and print the result.
# The mean is renamed instead of using agg({"Incidence": ...}): that
# dict-renaming form of SeriesGroupBy.agg was removed in pandas 1.0.
incidence = groups["target"].mean().rename("Incidence").reset_index()
print(incidence)
'''
    income  Incidence
0  average   0.049166
1     high   0.061543
2      low   0.043118
'''

# =============================================================================
# Preparing the predictor insight graph table
# =============================================================================

''' Constructing the predictor insight graph table '''


# Function that creates predictor insight graph table
def create_pig_table(basetable, target, variable):
    """Build the predictor insight graph table of ``variable``.

    Parameters
    ----------
    basetable : DataFrame holding at least the ``target`` and ``variable``
        columns.
    target : name of the target column whose mean is reported as the
        incidence.
    variable : name of the column to group by.

    Returns
    -------
    DataFrame with one row per group of ``variable`` and the columns
    ``variable``, ``"Size"`` (number of rows in the group) and
    ``"Incidence"`` (mean of ``target`` in the group).
    """
    # Create groups for each value of the variable
    groups = basetable[[target, variable]].groupby(variable)

    # Calculate size and target incidence for each group. The column named by
    # the `target` argument is aggregated (not a hard-coded "target"), and the
    # result columns are renamed afterwards because passing a renaming dict to
    # SeriesGroupBy.agg is no longer supported by pandas.
    pig_table = (
        groups[target]
        .agg(["size", "mean"])
        .rename(columns={"size": "Size", "mean": "Incidence"})
        .reset_index()
    )

    # Return the predictor insight graph table
    return pig_table


# Calculate the predictor insight graph table
# for the variable gender
pig_table_gender = create_pig_table(basetable, "target", "gender")

# Print the result
print(pig_table_gender)
'''
  gender   Size  Incidence
0      F  50033   0.053844
1      M  49967   0.045970
'''

''' Grouping all predictor insight graph tables '''

# Create the list of variables for our predictor insight graph tables
variables = ["income", "gender", "disc_mean_gift", "disc_time_since_last_gift"]

# Build a dictionary that maps each variable name to its predictor insight
# graph table. A comprehension is used so that the module-level name
# `pig_table` is not overwritten by the last table built here; the plotting
# call further down still uses `pig_table`.
pig_tables = {
    variable: create_pig_table(basetable, "target", variable)
    for variable in variables
}

# Print all the predictor insight graph tables
print(pig_tables)
'''
{'disc_time_since_last_gift':   disc_time_since_last_gift   Size  Incidence
0              (1050, 2518]  19953   0.023255
1                (462, 657]  20069   0.061986
2                (657, 833]  19996   0.050810
3               (833, 1050]  19971   0.033799
4                 [32, 462]  20011   0.079556,
 'disc_mean_gift':      disc_mean_gift   Size  Incidence
0        (103, 197]  19551   0.103524
1  (78.111, 86.889]  19997   0.029554
2  (86.889, 94.167]  20034   0.040831
3     (94.167, 103]  20405   0.063563
4       [2, 78.111]  20013   0.013042,
 'gender':   gender   Size  Incidence
0      F  50033   0.053844
1      M  49967   0.045970,
 'income':     income   Size  Incidence
0  average  62950   0.049166
1     high  16200   0.061543
2      low  20850   0.043118}
'''

# =============================================================================
# Plotting the predictor insight graph
# =============================================================================

''' Plotting the incidences '''

import matplotlib.pyplot as plt
import numpy as np

# The function to plot a predictor insight graph.
def plot_incidence(pig_table, variable):
    """Plot the "Incidence" column of a predictor insight graph table as a line.

    Parameters
    ----------
    pig_table : DataFrame with an "Incidence" column and a column named
        ``variable``.
    variable : name of the column whose values label the x-axis ticks; it is
        also used as the x-axis label.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Plot the incidence line. The column is spelled "Incidence" (capital I),
    # as in the y-limit computation below; a lowercase key raises KeyError.
    pig_table["Incidence"].plot()

    # Formatting the predictor insight graph
    n_groups = len(pig_table)
    plt.xticks(np.arange(n_groups), pig_table[variable])
    plt.xlim([-0.5, n_groups - 0.5])
    # Leave headroom: the y-axis runs up to twice the highest incidence
    plt.ylim([0, pig_table["Incidence"].max() * 2])
    plt.ylabel("Incidence", rotation=0, rotation_mode="anchor", ha="right")
    plt.xlabel(variable)

    # Show the graph
    plt.show()


# Apply the function for the variable "Country".
# NOTE(review): the only country table shown in this file names its column
# "Country" (capital C), so that spelling is used here; confirm against the
# actual `pig_table` in scope at this point.
plot_incidence(pig_table, "Country")

''' Plotting the group sizes '''


# The function to plot a predictor insight graph
def plot_pig(pig_table, variable):
    """Plot group sizes as bars and the incidence as a line on a second y-axis.

    Parameters
    ----------
    pig_table : DataFrame with "Size" and "Incidence" columns and a column
        named ``variable``.
    variable : name of the column whose values label the x-axis ticks.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Plot formatting
    plt.ylabel("Size", rotation=0, rotation_mode="anchor", ha="right")

    # Plot the bars with sizes
    pig_table["Size"].plot(kind="bar", width=0.5, color="lightgray", edgecolor="none")

    # Plot the incidence line on secondary axis
    pig_table["Incidence"].plot(secondary_y=True)

    # Plot formatting
    n_groups = len(pig_table)
    plt.xticks(np.arange(n_groups), pig_table[variable])
    plt.xlim([-0.5, n_groups - 0.5])
    plt.ylabel("Incidence", rotation=0, rotation_mode="anchor", ha="left")

    # Show the graph
    plt.show()


# Apply the function for the variable "Country"
# NOTE(review): same column-name assumption as for plot_incidence above.
plot_pig(pig_table, "Country")

''' Putting it all together '''

# Variables you want to make predictor insight graph tables for
variables = ["income", "gender", "disc_mean_gift", "disc_time_since_last_gift"]

# Loop through the variables
for variable in variables:
    # Create the predictor insight graph table
    pig_table = create_pig_table(basetable, "target", variable)

    # Plot the predictor insight graph
    plot_pig(pig_table, variable)
We use cookies to provide and improve our services. By using our site, you consent to our Cookies Policy. Accept Learn more