# =============================================================================
# Adding predictive variables
# =============================================================================

# --- Adding age --------------------------------------------------------------

# Reference date against which ages are computed.
reference_date = datetime.date(2017, 5, 1)

# Add age to the basetable.
# NOTE(review): calculate_age is defined elsewhere; presumably it returns the
# whole years between date_of_birth and reference_date -- confirm.
basetable["age"] = pd.Series(
    [calculate_age(date_of_birth, reference_date)
     for date_of_birth in basetable["date_of_birth"]]
)

# Calculate and report the mean age.
print(round(basetable["age"].mean()))

# --- Adding the donor segment ------------------------------------------------
#
# basetable sample:                segments sample:
#       donor_id                          donor_id segment
# 0        98304                    1379     69844  silver
# 1            4                    1191     36363    gold
# 2        32778                     681     67425  bronze
# ...        ...                     ...       ...     ...

# Add the donor segment to the basetable.
# A left join keeps every basetable row; donors without a segment get NaN
# (an inner join would keep only the intersection, with no NaN rows).
basetable = pd.merge(basetable, segments, on=["donor_id"], how="left")

# Count the number of donors in each segment.
# FIX: the original discarded this result; in a script (unlike a REPL) the
# groupby size is never shown unless printed.
print(basetable.groupby("segment").size())

# Count the number of donors with no segment assigned.
print(basetable["segment"].isna().sum())

# --- Adding living place -----------------------------------------------------
#
# living_places sample:
#        donor_ID start_date   end_date living_place
# 0         32768 1989-03-25 2000-07-06        India
# 1         32768 2000-07-06 2099-01-01           UK
# 2         98305 1954-04-06 1958-08-12          USA
# ...         ...        ...        ...          ...

# Reference date for selecting the currently valid living place.
reference_date = datetime.date(2017, 5, 1)

# Select the living-place record that is active on the reference date
# (start inclusive, end exclusive).
living_places_reference_date = living_places[
    (living_places["start_date"] <= reference_date)
    & (living_places["end_date"] > reference_date)
]

# Add living place to the basetable.
# NOTE(review): this is an inner join -- donors with no active living-place
# record on the reference date are dropped from the basetable. Confirm that
# is intended; otherwise pass how="left".
basetable = pd.merge(
    basetable,
    living_places_reference_date[["donor_ID", "living_place"]],
    on="donor_ID",
)

# =============================================================================
# Adding aggregated variables
# =============================================================================

# --- Maximum value last year --------------------------------------------------
#
# gifts sample:
#    id       date  amount
# 0   1 2015-10-16    75.0
# 1   1 2014-02-11   111.0
# 2   1 2012-03-28    93.0

# Start (inclusive) and end (exclusive) of the aggregation period.
start_date = datetime.date(2017, 1, 1)
end_date = datetime.date(2017, 5, 1)

# Select gifts made in 2017 and before May 1st.
gifts_2017 = gifts[(gifts["date"] >= start_date) & (gifts["date"] < end_date)]

# Maximum gift per donor in that period.
gifts_2017_bydonor = gifts_2017.groupby(["id"])["amount"].max().reset_index()
gifts_2017_bydonor.columns = ["donor_ID", "max_amount"]

# Add the maximum amount to the basetable.
# NOTE(review): default inner join on the shared "donor_ID" column -- donors
# with no 2017 gift are dropped; use how="left" to keep them with NaN.
basetable = pd.merge(basetable, gifts_2017_bydonor)

# --- Recency of donations -----------------------------------------------------

# Reference date from which recency is measured.
reference_date = datetime.date(2017, 5, 1)

# Select gifts made strictly before the reference date.
gifts_before_reference = gifts[(gifts["date"] < reference_date)]

# Date of the latest gift per donor, and the elapsed time since then.
last_gift = gifts_before_reference.groupby(["id"])["date"].max().reset_index()
last_gift["recency"] = reference_date - last_gift["date"]

# Add recency to the basetable; left join keeps donors who never donated
# before the reference date (their recency stays NaT).
basetable = pd.merge(basetable, last_gift[["id", "recency"]], how="left")
print(basetable)

# =============================================================================
# Adding evolutions
# =============================================================================

# --- Ratio of last month's and last year's average ----------------------------
# Given: gifts_last_month (donors who donated last month) and
#        gifts_last_year (donors who donated last year).

# Average gift last month for each donor.
average_gift_last_month = (
    gifts_last_month.groupby("id")["amount"].mean().reset_index()
)
average_gift_last_month.columns = ["donor_ID", "mean_gift_last_month"]

# Average gift last year for each donor.
average_gift_last_year = (
    gifts_last_year.groupby("id")["amount"].mean().reset_index()
)
average_gift_last_year.columns = ["donor_ID", "mean_gift_last_year"]

# Add both averages to the basetable (left joins keep all donors).
basetable = pd.merge(basetable, average_gift_last_month, on="donor_ID", how="left")
basetable = pd.merge(basetable, average_gift_last_year, on="donor_ID", how="left")

# Ratio of last month's to last year's average gift.
basetable["ratio_month_year"] = (
    basetable["mean_gift_last_month"] / basetable["mean_gift_last_year"]
)

# --- Absolute difference between two years ------------------------------------

# Number of gifts in 2016 and 2017 for each donor.
gifts_2016_bydonor = gifts_2016.groupby("id")["amount"].count().reset_index()
gifts_2016_bydonor.columns = ["donor_ID", "donations_2016"]
gifts_2017_bydonor = gifts_2017.groupby("id")["amount"].count().reset_index()
gifts_2017_bydonor.columns = ["donor_ID", "donations_2017"]

# Add both counts to the basetable.
basetable = pd.merge(basetable, gifts_2016_bydonor, on="donor_ID", how="left")
basetable = pd.merge(basetable, gifts_2017_bydonor, on="donor_ID", how="left")

# FIX: DataFrame.fillna returns a NEW DataFrame (it is not in-place by
# default). The original call discarded the result, so donors missing from
# either year kept NaN counts and the difference below became NaN for them.
basetable = basetable.fillna(0)

# Number of gifts in 2017 minus number of gifts in 2016.
basetable["gifts_2017_min_2016"] = (
    basetable["donations_2017"] - basetable["donations_2016"]
)
print(basetable.head())

# =============================================================================
# Using evolution variables
# =============================================================================

# --- Performance of evolution variables ---------------------------------------
# variables_regular   = ["gender_F", "age", "donations_2017"]
# variables_evolution = ["gender_F", "age", "donations_2017_min_2016"]

# Select the evolution variables and fit the model.
X_evolution = basetable[variables_evolution]
logreg.fit(X_evolution, y)

# Make predictions and calculate the AUC.
predictions_evolution = logreg.predict_proba(X_evolution)[:, 1]
auc_evolution = roc_auc_score(y, predictions_evolution)

# Print the respective AUC values (assumes auc_regular was computed earlier).
print(round(auc_regular, 2))    # 0.6
print(round(auc_evolution, 2))  # 0.7

# --- Meaning of evolution: plot the PIG table ---------------------------------

# Discretize the variable into 5 equal-frequency bins and add to the basetable.
basetable["donations_2017_min_2016_disc"] = pd.qcut(
    basetable["donations_2017_min_2016"], 5
)

# Construct the predictor insight graph (PIG) table.
pig_table = create_pig_table(basetable, "target", "donations_2017_min_2016_disc")

# Plot the predictor insight graph.
plot_pig(pig_table, "donations_2017_min_2016_disc")
# (removed: website cookie-banner text accidentally captured when this code
# was scraped from a web page -- it is not part of the program)