Predicting High-Opportunity Customers (Top 25%)

Jennifer Smith

June 23, 2022

Project Overview

This notebook builds a repeatable framework for preparing customer data for modeling, building and evaluating several models, and performing parameter tuning on two of them. Additional comments are included throughout the steps below.

This dataset was sourced from Kaggle and contains customer demographics and purchase information for a presumably hypothetical grocery-type store. I decided to see whether I could build a predictive model that correctly identifies high-opportunity customers, which for this project I define as the top 25% of customers by total spend. Such a model would allow a business to focus its marketing efforts and dollars on the high-opportunity customers it identifies, improving ROI and revenue.

The dataset was retrieved from the link below. It is tab separated and didn't import correctly at first, so I did a text-to-columns edit in Excel to fix the issue before importing it into my Jupyter notebook. https://www.kaggle.com/datasets/imakash3011/customer-personality-analysis
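As an alternative to the Excel text-to-columns fix, pandas can usually read a tab-separated file directly. A minimal sketch, assuming the file keeps its original tab-delimited form from Kaggle (not a cell from the original run):

import pandas as pd

# Telling pandas the delimiter avoids the manual text-to-columns step in Excel.
marketing_data = pd.read_csv('marketing_campaign.csv', sep='\t')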

Target Feature

The top 25% flag is not included in the dataset, so one of my first steps was to identify those customers and record them in a new binary column. That new column, 'Top25%', is my target.

Predictor Features

My predictor features are the demographic and behavioral columns: essentially everything else in the dataset except the columns beginning with 'Mnt', because those are category-level sales amounts that are already included in the total used to define the target.

Data Preparation

Before building my models, I performed the following data preparation tasks.

  • Filled in missing incomes.
    • I looked at averages by education and marital status, but since they were all close to the overall mean and median, I chose the overall median for simplicity (a sketch of that comparison appears after this list).
  • Created the target column.
  • Created a column with the number of days a customer has been active.
  • Converted the Education and Marital_Status columns to numerical categories.
  • Deleted unneeded columns: the customer ID, a couple of columns with the same value throughout, and the original date column.
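The income comparison mentioned in the first bullet isn't shown as a cell in this notebook; here is a minimal sketch of what it might look like, assuming the raw data is loaded the same way as in the cells further below:

import pandas as pd

marketing_data = pd.read_csv('marketing_campaign.csv')  # same file used later in the notebook

# Compare group medians to the overall mean/median before choosing a fill value.
print('Overall mean / median:', marketing_data['Income'].mean(), marketing_data['Income'].median())
print(marketing_data.groupby('Education')['Income'].median())
print(marketing_data.groupby('Marital_Status')['Income'].median())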
In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

pd.options.display.max_columns = None

Data Prep Function

Fills in missing incomes, drops unnecessary columns, converts categorical columns to numerical values, adds a column with the number of days as a customer, adds a binary column identifying customers in the top 25%, splits the data into train and test groups, identifies the predictor and target columns, and finally creates scaled versions of the train and test predictors.

In [2]:
def prepare_customer_data(data):

    # filled missing incomes with median
    data['Income'] = data['Income'].fillna(data['Income'].median())

    # remove unnecessary columns
    data_prep = data.drop(columns=['ID','Z_CostContact','Z_Revenue'])

    # replace Education with numeric categories
    data_prep['Education'] = data_prep['Education'].replace(to_replace={'Basic':0, 'Graduation':1,'Master':2, '2n Cycle':3, 'PhD':4})

    # replace Marital Status with numeric categories
    data_prep['Marital_Status'] = data_prep['Marital_Status'].replace(to_replace={'Single':0, 'Alone':0, 'YOLO':0, 'Absurd':0, 'Together':1,'Married':2, 'Divorced':3, 'Widow':4})

    # convert date column to Pandas datetime & add new column with integer number of days (models don't work with datetime)
    data_prep['Dt_Customer'] = pd.to_datetime(data_prep['Dt_Customer'])
    data_prep['DaysCust'] = (data_prep['Dt_Customer'].max() - data_prep['Dt_Customer']).dt.days.astype('int16')

    # remove original Dt_Customer column
    data_prep = data_prep.drop(columns='Dt_Customer')

    # add column classifying Top 25% customers (high value customers)
    data_prep['Top25%'] = np.where(data_prep['MntTotal'] >= data_prep['MntTotal'].quantile(q=0.75), 1, 0)

    # list of column names for predictors
    predictors = ['Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
           'Teenhome', 'Recency','NumDealsPurchases', 'NumWebPurchases',
           'NumCatalogPurchases', 'NumStorePurchases', 'NumTotalPurchases',
           'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
           'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response', 'DaysCust']

    # column name for target
    target = 'Top25%'
    
    # split into a training and testing set
    train, test = train_test_split(data_prep)
    
    # create scaled train and test sets
    ss = StandardScaler()
    ss.fit(train[predictors])
    scaled_train = ss.transform(train[predictors])
    scaled_test = ss.transform(test[predictors])

    return train, test, predictors, target, scaled_train, scaled_test

Model creation & evaluation function

The model function is built to use the scaled data, because most models improved once the data was standardized.

In [3]:
def create_scaled_model(model_type,train_data,test_data,scaled_train_data,scaled_test_data,predictors,target):

    clf = model_type
    clf.fit(scaled_train_data, train_data[target])
    predictions = clf.predict(scaled_test_data)
    accuracy = metrics.accuracy_score(test_data[target], predictions)
    cm = metrics.confusion_matrix(test_data[target], predictions)
    sns.heatmap(cm, annot=True, fmt='.0f')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Top25%')
    class1_error_rate = cm[1][0] / (cm[1][0] + cm[1][1])  # false negatives / actual positives
    class0_error_rate = cm[0][1] / (cm[0][1] + cm[0][0])  # false positives / actual negatives
    print(model_type)
    print('Accuracy:',accuracy)
    print('Class 1 Error Rate:', class1_error_rate, '(i.e., wrong on customers who are in top 25%)')
    print('Class 0 Error Rate:', class0_error_rate, '(i.e., wrong on customers who are not in top 25%)')
    plt.show()
    return class1_error_rate, class0_error_rate, accuracy

Data loaded and models run

I ran four types of predictive models - KNeighborsClassifier, DecisionTreeClassifier, LogisticRegression, and RandomForestClassifier - each first with default settings and then with its weighting parameter set to the option that produced the best results (weights='distance' for KNeighborsClassifier, class_weight='balanced' for the others). I thought these would be good starting points for trying some parameter tuning later in the notebook.

For the purpose of this experiment, I decided to focus primarily on the Class 1 Error Rate, on the assumption that the current business priority is to identify as many high-opportunity customers (the top 25%) as possible with the goal of optimizing revenue. With that in mind, the best-performing model of those tried here was the LogisticRegression model with balanced class weights, which consistently produces a single-digit (percentage) Class 1 Error Rate.
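For reference, the Class 1 Error Rate computed in the function above is equivalent to 1 minus recall on the positive class. A minimal cross-check using scikit-learn's recall_score, assuming a fitted classifier clf plus the test and scaled_test objects produced in the cells below (not a cell from the original run):

from sklearn.metrics import recall_score

# Class 1 Error Rate = 1 - recall on the positive ('Top25%') class.
# clf, scaled_test, and test are assumed to exist from the surrounding cells.
predictions = clf.predict(scaled_test)
class1_error_rate = 1 - recall_score(test['Top25%'], predictions)
print('Class 1 Error Rate (cross-check):', class1_error_rate)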

In [11]:
marketing_data = pd.read_csv('marketing_campaign.csv')
train, test, predictors, target, scaled_train, scaled_test = prepare_customer_data(marketing_data)

create_scaled_model(KNeighborsClassifier(),train,test,scaled_train,scaled_test,predictors,target)
create_scaled_model(KNeighborsClassifier(weights='distance'),train,test,scaled_train,scaled_test,predictors,target)
create_scaled_model(DecisionTreeClassifier(),train,test,scaled_train,scaled_test,predictors,target)
create_scaled_model(DecisionTreeClassifier(class_weight='balanced'),train,test,scaled_train,scaled_test,predictors,target)
create_scaled_model(LogisticRegression(),train,test,scaled_train,scaled_test,predictors,target)
create_scaled_model(LogisticRegression(class_weight='balanced'),train,test,scaled_train,scaled_test,predictors,target)
create_scaled_model(RandomForestClassifier(),train,test,scaled_train,scaled_test,predictors,target)
create_scaled_model(RandomForestClassifier(class_weight='balanced'),train,test,scaled_train,scaled_test,predictors,target)
KNeighborsClassifier()
Accuracy: 0.8714285714285714
Class 1 Error Rate: 0.2440944881889764 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.09468822170900693 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(weights='distance')
Accuracy: 0.8857142857142857
Class 1 Error Rate: 0.2283464566929134 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.08083140877598152 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier()
Accuracy: 0.9053571428571429
Class 1 Error Rate: 0.14960629921259844 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.07852193995381063 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced')
Accuracy: 0.8821428571428571
Class 1 Error Rate: 0.2047244094488189 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.09237875288683603 (i.e., wrong on customers who are not in top 25%)
LogisticRegression()
Accuracy: 0.8803571428571428
Class 1 Error Rate: 0.2283464566929134 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.08775981524249422 (i.e., wrong on customers who are not in top 25%)
LogisticRegression(class_weight='balanced')
Accuracy: 0.8642857142857143
Class 1 Error Rate: 0.06299212598425197 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.15704387990762125 (i.e., wrong on customers who are not in top 25%)
RandomForestClassifier()
Accuracy: 0.9214285714285714
Class 1 Error Rate: 0.11023622047244094 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.06928406466512702 (i.e., wrong on customers who are not in top 25%)
RandomForestClassifier(class_weight='balanced')
Accuracy: 0.9214285714285714
Class 1 Error Rate: 0.11811023622047244 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.06697459584295612 (i.e., wrong on customers who are not in top 25%)
Out[11]:
(0.11811023622047244, 0.06697459584295612, 0.9214285714285714)
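As a quick illustration of the earlier note that most models improved once the data was standardized, the same KNeighbors model can be fit on the unscaled predictors for comparison. A minimal sketch using the train/test objects returned by prepare_customer_data above (not part of the original run):

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Without standardization, KNN distances are dominated by wide-range features
# such as Income, which typically hurts accuracy relative to the scaled run.
clf_raw = KNeighborsClassifier()
clf_raw.fit(train[predictors], train[target])
raw_predictions = clf_raw.predict(test[predictors])
print('Unscaled KNN accuracy:', metrics.accuracy_score(test[target], raw_predictions))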

Parameter Tuning Experiment 1: K Neighbors

First I tried a loop with the KNeighborsClassifier model, trying every odd value of k from 1 to 19. I then plotted both the error rates and the overall accuracy as line graphs.

In [5]:
k_neighbors = [1,3,5,7,9,11,13,15,17,19]
c1er_list = []
c0er_list = []
accuracy_list = []

for k in k_neighbors:
    c1er, c0er, accuracy =create_scaled_model(KNeighborsClassifier(weights='distance',n_neighbors=k),train,test,scaled_train,scaled_test,predictors,target)
    c1er_list.append(c1er)
    c0er_list.append(c0er)
    accuracy_list.append(accuracy)
KNeighborsClassifier(n_neighbors=1, weights='distance')
Accuracy: 0.8803571428571428
Class 1 Error Rate: 0.291970802919708 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.06382978723404255 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(n_neighbors=3, weights='distance')
Accuracy: 0.8803571428571428
Class 1 Error Rate: 0.30656934306569344 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.0591016548463357 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(weights='distance')
Accuracy: 0.8964285714285715
Class 1 Error Rate: 0.2773722627737226 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.04728132387706856 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(n_neighbors=7, weights='distance')
Accuracy: 0.875
Class 1 Error Rate: 0.3357664233576642 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.05673758865248227 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(n_neighbors=9, weights='distance')
Accuracy: 0.8857142857142857
Class 1 Error Rate: 0.3284671532846715 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.04491725768321513 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(n_neighbors=11, weights='distance')
Accuracy: 0.8928571428571429
Class 1 Error Rate: 0.30656934306569344 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.0425531914893617 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(n_neighbors=13, weights='distance')
Accuracy: 0.8892857142857142
Class 1 Error Rate: 0.31386861313868614 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.04491725768321513 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(n_neighbors=15, weights='distance')
Accuracy: 0.8875
Class 1 Error Rate: 0.3284671532846715 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.0425531914893617 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(n_neighbors=17, weights='distance')
Accuracy: 0.8857142857142857
Class 1 Error Rate: 0.31386861313868614 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.04964539007092199 (i.e., wrong on customers who are not in top 25%)
KNeighborsClassifier(n_neighbors=19, weights='distance')
Accuracy: 0.8839285714285714
Class 1 Error Rate: 0.3284671532846715 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.04728132387706856 (i.e., wrong on customers who are not in top 25%)
In [6]:
plt.plot(k_neighbors,c1er_list,label='Class 1 Error Rate')
plt.plot(k_neighbors,c0er_list,label='Class 0 Error Rate')
plt.title('K Neighbors Tuning Experiment: Error Rates')
plt.xlabel('n_neighbors (k) =')
plt.legend()
plt.show()

plt.plot(k_neighbors,accuracy_list,label='Accuracy')
plt.title('K Neighbors Tuning Experiment: Overall Accuracy')
plt.xlabel('n_neighbors (k) =')
plt.show()

Parameter Tuning Experiment 2: Decision Tree Max Depth

Next I tried a loop with my DecisionTree model with max_depth set to every value from 1 to 20, again plotting the results as line graphs. This experiment produced much more interesting results, which I comment on at the end of the notebook.

In [9]:
depth = list(range(1, 21))
c1er_list = []
c0er_list = []
accuracy_list = []

for d in depth:
    c1er, c0er, accuracy =create_scaled_model(DecisionTreeClassifier(class_weight='balanced',max_depth = d),train,test,scaled_train,scaled_test,predictors,target)
    c1er_list.append(c1er)
    c0er_list.append(c0er)
    accuracy_list.append(accuracy)
DecisionTreeClassifier(class_weight='balanced', max_depth=1)
Accuracy: 0.8142857142857143
Class 1 Error Rate: 0.08759124087591241 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.21749408983451538 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=2)
Accuracy: 0.8821428571428571
Class 1 Error Rate: 0.0948905109489051 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.12529550827423167 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=3)
Accuracy: 0.8821428571428571
Class 1 Error Rate: 0.06569343065693431 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.1347517730496454 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=4)
Accuracy: 0.8875
Class 1 Error Rate: 0.08759124087591241 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.12056737588652482 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=5)
Accuracy: 0.8928571428571429
Class 1 Error Rate: 0.13138686131386862 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.09929078014184398 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=6)
Accuracy: 0.8625
Class 1 Error Rate: 0.11678832116788321 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.14420803782505912 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=7)
Accuracy: 0.8946428571428572
Class 1 Error Rate: 0.12408759124087591 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.09929078014184398 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=8)
Accuracy: 0.8875
Class 1 Error Rate: 0.17518248175182483 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.09219858156028368 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=9)
Accuracy: 0.8803571428571428
Class 1 Error Rate: 0.20437956204379562 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.09219858156028368 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=10)
Accuracy: 0.875
Class 1 Error Rate: 0.21897810218978103 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.09456264775413711 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=11)
Accuracy: 0.8803571428571428
Class 1 Error Rate: 0.21897810218978103 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.08747044917257683 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=12)
Accuracy: 0.8767857142857143
Class 1 Error Rate: 0.21897810218978103 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.09219858156028368 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=13)
Accuracy: 0.8785714285714286
Class 1 Error Rate: 0.24087591240875914 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.08274231678486997 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=14)
Accuracy: 0.8875
Class 1 Error Rate: 0.22627737226277372 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.07565011820330969 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=15)
Accuracy: 0.8821428571428571
Class 1 Error Rate: 0.31386861313868614 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.054373522458628844 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=16)
Accuracy: 0.8821428571428571
Class 1 Error Rate: 0.2773722627737226 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.06619385342789598 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=17)
Accuracy: 0.8875
Class 1 Error Rate: 0.2846715328467153 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.05673758865248227 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=18)
Accuracy: 0.8767857142857143
Class 1 Error Rate: 0.32116788321167883 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.0591016548463357 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=19)
Accuracy: 0.8803571428571428
Class 1 Error Rate: 0.30656934306569344 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.0591016548463357 (i.e., wrong on customers who are not in top 25%)
DecisionTreeClassifier(class_weight='balanced', max_depth=20)
Accuracy: 0.8803571428571428
Class 1 Error Rate: 0.291970802919708 (i.e., wrong on customers who are in top 25%)
Class 0 Error Rate: 0.06382978723404255 (i.e., wrong on customers who are not in top 25%)
In [10]:
plt.plot(depth,c1er_list,label='Class 1 Error Rate')
plt.plot(depth,c0er_list,label='Class 0 Error Rate')
plt.title('Decision Tree Depth Tuning Experiment: Error Rates')
plt.xlabel('depth')
plt.legend()
plt.show()

plt.plot(depth,accuracy_list,label='Accuracy')
plt.title('Decision Tree Depth Tuning Experiment: Overall Accuracy')
plt.xlabel('depth')
plt.show()

Interpreting Results

All of my models performed better than random guessing. I was trying to predict whether customers would be in the top 25%, so simply guessing "no" across the board would be accurate about 75% of the time.
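That 75% baseline can be confirmed with a majority-class dummy model. A minimal sketch using the same train/test split as above (not part of the original run):

from sklearn.dummy import DummyClassifier
from sklearn import metrics

# Always predicts the majority class, i.e., 'not in the top 25%'.
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(train[predictors], train[target])
baseline_accuracy = metrics.accuracy_score(test[target], baseline.predict(test[predictors]))
print('Majority-class baseline accuracy:', baseline_accuracy)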

Since all of the models performed better than random chance and different models optimize different metrics, choosing the right one will depend on business objectives and priorities:

  • If revenue growth is most important, minimizing false negatives would be ideal because it would result in the largest number of high-opportunity customers being identified.
  • On the other hand, if cost control is most important, limiting false positives would be best, as it would reduce wasted spend on customers who will ultimately not rise to the top.
  • If balance is desired, the best strategy would be to choose the model with the highest overall accuracy.

I didn't feel that the KNeighbors tuning experiment told me anything particularly interesting. The Class 0 Error Rate slowly declined as the value of k increased, while the Class 1 Error Rate generally trended upward. If I'm looking for a model with the lowest Class 1 Error Rate, none of these are good options.

I was surprised by my second experiment, though, where changing the depth has a large effect on both the Class 1 and Class 0 Error Rates. With a max depth of 3, this model actually performs similarly to the LogisticRegression with balanced class weights that had previously been my best model. An advantage of this DecisionTree model is that by changing a single parameter I can get either error rate down to single digits, which could make it a really practical model to put into practice. Business budgets can change frequently, and priorities along with them, so it could be useful that this model can be set to optimize revenue with a max depth of 3 (a low Class 1 Error Rate, i.e., few false negatives), and then easily shifted to a more conservative, cost-controlling approach with a max depth of 15 (a low Class 0 Error Rate, i.e., few false positives). A sketch of that switch follows below.
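To make that switch concrete, here is a minimal sketch of choosing the depth from a business-priority flag; the priority variable is illustrative and not part of the original notebook, and it reuses the scaled data and target from the cells above:

from sklearn.tree import DecisionTreeClassifier

# 'revenue' priority -> shallow tree (low Class 1 Error Rate, few false negatives);
# 'cost' priority    -> deeper tree (low Class 0 Error Rate, few false positives).
priority = 'revenue'  # illustrative flag
chosen_depth = 3 if priority == 'revenue' else 15

final_model = DecisionTreeClassifier(class_weight='balanced', max_depth=chosen_depth)
final_model.fit(scaled_train, train[target])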

In [ ]: