# =============================================================================
# Adding predictive variables
# =============================================================================

# --- Adding age --------------------------------------------------------------

# Reference date against which ages are computed.
reference_date = datetime.date(2017, 5, 1)

# Add age to the basetable.
# NOTE(review): calculate_age is defined elsewhere; presumably it returns the
# whole years between date_of_birth and reference_date -- confirm.
basetable["age"] = pd.Series(
    [calculate_age(date_of_birth, reference_date)
     for date_of_birth in basetable["date_of_birth"]]
)

# Calculate and report the mean age.
print(round(basetable["age"].mean()))

# --- Adding the donor segment ------------------------------------------------
#
# basetable sample:                segments sample:
#       donor_id                          donor_id segment
# 0        98304                    1379     69844  silver
# 1            4                    1191     36363    gold
# 2        32778                     681     67425  bronze
# ...        ...                     ...       ...     ...

# Add the donor segment to the basetable.
# A left join keeps every basetable row; donors without a segment get NaN
# (an inner join would keep only the intersection, with no NaN rows).
basetable = pd.merge(basetable, segments, on=["donor_id"], how="left")

# Count the number of donors in each segment.
# FIX: the original discarded this result; in a script (unlike a REPL) the
# groupby size is never shown unless printed.
print(basetable.groupby("segment").size())

# Count the number of donors with no segment assigned.
print(basetable["segment"].isna().sum())

# --- Adding living place -----------------------------------------------------
#
# living_places sample:
#        donor_ID start_date   end_date living_place
# 0         32768 1989-03-25 2000-07-06        India
# 1         32768 2000-07-06 2099-01-01           UK
# 2         98305 1954-04-06 1958-08-12          USA
# ...         ...        ...        ...          ...

# Reference date for selecting the currently valid living place.
reference_date = datetime.date(2017, 5, 1)

# Select the living-place record that is active on the reference date
# (start inclusive, end exclusive).
living_places_reference_date = living_places[
    (living_places["start_date"] <= reference_date)
    & (living_places["end_date"] > reference_date)
]

# Add living place to the basetable.
# NOTE(review): this is an inner join -- donors with no active living-place
# record on the reference date are dropped from the basetable. Confirm that
# is intended; otherwise pass how="left".
basetable = pd.merge(
    basetable,
    living_places_reference_date[["donor_ID", "living_place"]],
    on="donor_ID",
)

# =============================================================================
# Adding aggregated variables
# =============================================================================

# --- Maximum value last year --------------------------------------------------
#
# gifts sample:
#    id       date  amount
# 0   1 2015-10-16    75.0
# 1   1 2014-02-11   111.0
# 2   1 2012-03-28    93.0

# Start (inclusive) and end (exclusive) of the aggregation period.
start_date = datetime.date(2017, 1, 1)
end_date = datetime.date(2017, 5, 1)

# Select gifts made in 2017 and before May 1st.
gifts_2017 = gifts[(gifts["date"] >= start_date) & (gifts["date"] < end_date)]

# Maximum gift per donor in that period.
gifts_2017_bydonor = gifts_2017.groupby(["id"])["amount"].max().reset_index()
gifts_2017_bydonor.columns = ["donor_ID", "max_amount"]

# Add the maximum amount to the basetable.
# NOTE(review): default inner join on the shared "donor_ID" column -- donors
# with no 2017 gift are dropped; use how="left" to keep them with NaN.
basetable = pd.merge(basetable, gifts_2017_bydonor)

# --- Recency of donations -----------------------------------------------------

# Reference date from which recency is measured.
reference_date = datetime.date(2017, 5, 1)

# Select gifts made strictly before the reference date.
gifts_before_reference = gifts[(gifts["date"] < reference_date)]

# Date of the latest gift per donor, and the elapsed time since then.
last_gift = gifts_before_reference.groupby(["id"])["date"].max().reset_index()
last_gift["recency"] = reference_date - last_gift["date"]

# Add recency to the basetable; left join keeps donors who never donated
# before the reference date (their recency stays NaT).
basetable = pd.merge(basetable, last_gift[["id", "recency"]], how="left")
print(basetable)

# =============================================================================
# Adding evolutions
# =============================================================================

# --- Ratio of last month's and last year's average ----------------------------
# Given: gifts_last_month (donors who donated last month) and
#        gifts_last_year (donors who donated last year).

# Average gift last month for each donor.
average_gift_last_month = (
    gifts_last_month.groupby("id")["amount"].mean().reset_index()
)
average_gift_last_month.columns = ["donor_ID", "mean_gift_last_month"]

# Average gift last year for each donor.
average_gift_last_year = (
    gifts_last_year.groupby("id")["amount"].mean().reset_index()
)
average_gift_last_year.columns = ["donor_ID", "mean_gift_last_year"]

# Add both averages to the basetable (left joins keep all donors).
basetable = pd.merge(basetable, average_gift_last_month, on="donor_ID", how="left")
basetable = pd.merge(basetable, average_gift_last_year, on="donor_ID", how="left")

# Ratio of last month's to last year's average gift.
basetable["ratio_month_year"] = (
    basetable["mean_gift_last_month"] / basetable["mean_gift_last_year"]
)

# --- Absolute difference between two years ------------------------------------

# Number of gifts in 2016 and 2017 for each donor.
gifts_2016_bydonor = gifts_2016.groupby("id")["amount"].count().reset_index()
gifts_2016_bydonor.columns = ["donor_ID", "donations_2016"]
gifts_2017_bydonor = gifts_2017.groupby("id")["amount"].count().reset_index()
gifts_2017_bydonor.columns = ["donor_ID", "donations_2017"]

# Add both counts to the basetable.
basetable = pd.merge(basetable, gifts_2016_bydonor, on="donor_ID", how="left")
basetable = pd.merge(basetable, gifts_2017_bydonor, on="donor_ID", how="left")

# FIX: DataFrame.fillna returns a NEW DataFrame (it is not in-place by
# default). The original call discarded the result, so donors missing from
# either year kept NaN counts and the difference below became NaN for them.
basetable = basetable.fillna(0)

# Number of gifts in 2017 minus number of gifts in 2016.
basetable["gifts_2017_min_2016"] = (
    basetable["donations_2017"] - basetable["donations_2016"]
)
print(basetable.head())

# =============================================================================
# Using evolution variables
# =============================================================================

# --- Performance of evolution variables ---------------------------------------
# variables_regular   = ["gender_F", "age", "donations_2017"]
# variables_evolution = ["gender_F", "age", "donations_2017_min_2016"]

# Select the evolution variables and fit the model.
X_evolution = basetable[variables_evolution]
logreg.fit(X_evolution, y)

# Make predictions and calculate the AUC.
predictions_evolution = logreg.predict_proba(X_evolution)[:, 1]
auc_evolution = roc_auc_score(y, predictions_evolution)

# Print the respective AUC values (assumes auc_regular was computed earlier).
print(round(auc_regular, 2))    # 0.6
print(round(auc_evolution, 2))  # 0.7

# --- Meaning of evolution: plot the PIG table ---------------------------------

# Discretize the variable into 5 equal-frequency bins and add to the basetable.
basetable["donations_2017_min_2016_disc"] = pd.qcut(
    basetable["donations_2017_min_2016"], 5
)

# Construct the predictor insight graph (PIG) table.
pig_table = create_pig_table(basetable, "target", "donations_2017_min_2016_disc")

# Plot the predictor insight graph.
plot_pig(pig_table, "donations_2017_min_2016_disc")
# (removed: website cookie-banner text accidentally captured when this code
# was scraped from a web page -- it is not part of the program)