import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
import warnings
warnings.simplefilter(action="ignore")
warnings.filterwarnings("ignore", category=UserWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 170)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

df = pd.read_csv("E_Commerce.csv")

print("------------------- Shape -------------------")
print(df.shape)
print("------------------- Types -------------------")
print(df.dtypes)

------------------- Shape -------------------
(5630, 20)
------------------- Types -------------------
CustomerID                       int64
Churn                            int64
Tenure                         float64
PreferredLoginDevice            object
CityTier                         int64
WarehouseToHome                float64
PreferredPaymentMode            object
Gender                          object
HourSpendOnApp                 float64
NumberOfDeviceRegistered         int64
PreferedOrderCat                object
SatisfactionScore                int64
MaritalStatus                   object
NumberOfAddress                  int64
Complain                         int64
OrderAmountHikeFromlastYear    float64
CouponUsed                     float64
OrderCount                     float64
DaySinceLastOrder              float64
CashbackAmount                   int64
dtype: object

df.head(10)

df.describe().T

df.describe(include='O').T

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   Tenure                       5366 non-null   float64
 3   PreferredLoginDevice         5630 non-null   object 
 4   CityTier                     5630 non-null   int64  
 5   WarehouseToHome              5379 non-null   float64
 6   PreferredPaymentMode         5630 non-null   object 
 7   Gender                       5630 non-null   object 
 8   HourSpendOnApp               5375 non-null   float64
 9   NumberOfDeviceRegistered     5630 non-null   int64  
 10  PreferedOrderCat             5630 non-null   object 
 11  SatisfactionScore            5630 non-null   int64  
 12  MaritalStatus                5630 non-null   object 
 13  NumberOfAddress              5630 non-null   int64  
 14  Complain                     5630 non-null   int64  
 15  OrderAmountHikeFromlastYear  5365 non-null   float64
 16  CouponUsed                   5374 non-null   float64
 17  OrderCount                   5372 non-null   float64
 18  DaySinceLastOrder            5323 non-null   float64
 19  CashbackAmount               5630 non-null   int64  
dtypes: float64(7), int64(8), object(5)
memory usage: 879.8+ KB

df.isnull().sum().sort_values(ascending=0)

DaySinceLastOrder              307
OrderAmountHikeFromlastYear    265
Tenure                         264
OrderCount                     258
CouponUsed                     256
HourSpendOnApp                 255
WarehouseToHome                251
CustomerID                       0
MaritalStatus                    0
Complain                         0
NumberOfAddress                  0
PreferedOrderCat                 0
SatisfactionScore                0
Churn                            0
NumberOfDeviceRegistered         0
Gender                           0
PreferredPaymentMode             0
CityTier                         0
PreferredLoginDevice             0
CashbackAmount                   0
dtype: int64

def grab_col_names(df, cat_th=10, car_th=20):
      # cat_cols, cat_but_car
    cat_cols = [col for col in df.columns if df[col].dtypes == "O"]
    num_but_cat = [col for col in df.columns if df[col].nunique() < cat_th and df[col].dtypes != "O"]
    cat_but_car = [col for col in df.columns if df[col].nunique() > car_th and df[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in df.columns if df[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {df.shape[0]}")
    print(f"Variables: {df.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 5630
Variables: 20
cat_cols: 11
num_cols: 9
cat_but_car: 0
num_but_cat: 6

cat_cols

['PreferredLoginDevice',
 'PreferredPaymentMode',
 'Gender',
 'PreferedOrderCat',
 'MaritalStatus',
 'Churn',
 'CityTier',
 'HourSpendOnApp',
 'NumberOfDeviceRegistered',
 'SatisfactionScore',
 'Complain']

def cat_summary(dataframe, col_name):
    summary = pd.DataFrame({
        col_name: dataframe[col_name].value_counts().index,
        "Count": dataframe[col_name].value_counts().values,
        "Ratio": 100 * dataframe[col_name].value_counts().values / len(dataframe)
    })
    summary = summary.sort_values("Ratio", ascending=False).reset_index(drop=True)
    return summary

def plot_cat_summary(summaries, ncols=3, figsize=(15, 15)):
    nrows = (len(summaries) + ncols - 1) // ncols
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
    axes = axes.flatten()

    for i, (col, summary) in enumerate(summaries.items()):
        ax = axes[i]
        bars = sns.barplot(x=col, y="Ratio", data=summary, ax=ax)
        ax.set_title(col, fontsize=14, pad=20)
        ax.set_xlabel("")
        ax.set_ylabel("Ratio (%)")
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

        # Add labels on top of each bar
        for bar in bars.patches:
            height = bar.get_height()
            width = bar.get_width()
            x = bar.get_x()
            category = summary[col][summary['Ratio'] == height].values[0]
            ratio = height
            count = summary[summary[col] == category]['Count'].values[0]
            ax.text(x + width/2., height,
                    f'{ratio:.2f}%\n({count})',
                    ha='center', va='bottom', rotation=0)

    for j in range(i+1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

# Assuming df and cat_cols are already defined
summaries = {}
for col in cat_cols:
    summary = cat_summary(df, col)
    summaries[col] = summary
    print(f"Summary for {col}:")
    print(summary)
    print("---------------------------------------------------------------")

# Plot the summaries
plot_cat_summary(summaries)

Summary for PreferredLoginDevice:
  PreferredLoginDevice  Count  Ratio
0         Mobile Phone   2765 49.112
1             Computer   1634 29.023
2                Phone   1231 21.865
---------------------------------------------------------------
Summary for PreferredPaymentMode:
  PreferredPaymentMode  Count  Ratio
0           Debit Card   2314 41.101
1          Credit Card   1501 26.661
2             E wallet    614 10.906
3                  UPI    414  7.353
4                  COD    365  6.483
5                   CC    273  4.849
6     Cash on Delivery    149  2.647
---------------------------------------------------------------
Summary for Gender:
   Gender  Count  Ratio
0    Male   3384 60.107
1  Female   2246 39.893
---------------------------------------------------------------
Summary for PreferedOrderCat:
     PreferedOrderCat  Count  Ratio
0  Laptop & Accessory   2050 36.412
1        Mobile Phone   1271 22.575
2             Fashion    826 14.671
3              Mobile    809 14.369
4             Grocery    410  7.282
5              Others    264  4.689
---------------------------------------------------------------
Summary for MaritalStatus:
  MaritalStatus  Count  Ratio
0       Married   2986 53.037
1        Single   1796 31.901
2      Divorced    848 15.062
---------------------------------------------------------------
Summary for Churn:
   Churn  Count  Ratio
0      0   4682 83.162
1      1    948 16.838
---------------------------------------------------------------
Summary for CityTier:
   CityTier  Count  Ratio
0         1   3666 65.115
1         3   1722 30.586
2         2    242  4.298
---------------------------------------------------------------
Summary for HourSpendOnApp:
   HourSpendOnApp  Count  Ratio
0           3.000   2687 47.726
1           2.000   1471 26.128
2           4.000   1176 20.888
3           1.000     35  0.622
4           0.000      3  0.053
5           5.000      3  0.053
---------------------------------------------------------------
Summary for NumberOfDeviceRegistered:
   NumberOfDeviceRegistered  Count  Ratio
0                         4   2377 42.220
1                         3   1699 30.178
2                         5    881 15.648
3                         2    276  4.902
4                         1    235  4.174
5                         6    162  2.877
---------------------------------------------------------------
Summary for SatisfactionScore:
   SatisfactionScore  Count  Ratio
0                  3   1698 30.160
1                  1   1164 20.675
2                  5   1108 19.680
3                  4   1074 19.076
4                  2    586 10.409
---------------------------------------------------------------
Summary for Complain:
   Complain  Count  Ratio
0         0   4026 71.510
1         1   1604 28.490
---------------------------------------------------------------

num_cols

['CustomerID',
 'Tenure',
 'WarehouseToHome',
 'NumberOfAddress',
 'OrderAmountHikeFromlastYear',
 'CouponUsed',
 'OrderCount',
 'DaySinceLastOrder',
 'CashbackAmount']

quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
df[num_cols].describe(quantiles)

def plot_summary(dataframe, numerical_cols, figsize=(15, 10), bins=30):
    # Calculate the number of rows and columns for the subplot grid
    n_cols = 3
    n_rows = (len(numerical_cols) - 1) // n_cols + 1

    # Create the subplot grid
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
    axes = axes.flatten()  # Flatten the 2D array of axes to 1D for easy indexing

    for i, col in enumerate(numerical_cols):
        ax = axes[i]
        sns.histplot(data=dataframe, x=col, kde=True, ax=ax, bins=bins)
        ax.set_title(col)
        ax.set_xlabel('')

    # Remove any unused subplots
    for j in range(i+1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

# Assuming df and num_cols are already defined
plot_summary(df, num_cols)

def target_summary_with_num(dataframe, target, numerical_cols):
    return dataframe.groupby(target)[numerical_cols].mean()

result = target_summary_with_num(df, "Churn", num_cols)
result.round(2)

def target_summary_with_num(dataframe, target, numerical_cols):
    return dataframe.groupby(target)[numerical_cols].mean()

# Calculate the summary statistics
result = target_summary_with_num(df, "Churn", num_cols)

# Set up the plotting style
plt.style.use('seaborn-v0_8')

# Create subplots
fig, axs = plt.subplots(len(num_cols), 2, figsize=(15,35))

# Ensure axs is always 2D
if len(num_cols) == 1:
    axs = axs.reshape(1, -1)

# Plot each numerical variable
for i, col in enumerate(num_cols):
    # Pie chart
    churned = df[df['Churn'] == 1][col].sum()
    not_churned = df[df['Churn'] == 0][col].sum()
    axs[i, 0].pie([not_churned, churned], labels=['Not Churned', 'Churned'], autopct='%1.1f%%')
    axs[i, 0].set_title(f'Distribution of Total {col} by Churn Status')

    # Box plot
    sns.boxplot(x='Churn', y=col, data=df, ax=axs[i, 1])
    axs[i, 1].set_title(f'Distribution of {col} by Churn Status')
    axs[i, 1].set_xlabel('Churn')
    axs[i, 1].set_ylabel(col)

# Adjust layout and show plot
plt.tight_layout()
plt.show()

def target_summary_with_cat(dataframe, target, categorical_cols):
    fig, axes = plt.subplots(1, 2, figsize=(20, 6))
    
    # Define a color palette
    color_palette = sns.color_palette("Paired", 8)  # Adjust the number of colors as needed
    
    for i, categorical_col in enumerate(categorical_cols):
        summary = pd.DataFrame({
            "CHURN_SUM": dataframe.groupby(categorical_col)[target].sum(),
            "Count": dataframe[categorical_col].value_counts(),
            "Ratio": (100 * dataframe[categorical_col].value_counts() / len(dataframe)).round(2)
        }).sort_index()
        
        print(categorical_col)
        print(summary, end="\n\n\n")
        
        ax1 = axes[i]
        
        # Bar plot for Count with different colors
        bars = ax1.bar(summary.index, summary['Count'], alpha=0.8)
        for j, bar in enumerate(bars):
            bar.set_color(color_palette[j % len(color_palette)])
        
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel('Count', color='blue')
        ax1.tick_params(axis='x', rotation=45)
        ax1.tick_params(axis='y', labelcolor='blue')
        
        # Add Ratio labels on top of the Count bars
        for j, v in enumerate(summary['Count']):
            ax1.text(j, v, f"{summary['Ratio'].iloc[j]}%", ha='center', va='bottom')
        
        # Create a twin axis for CHURN_SUM
        ax2 = ax1.twinx()
        
        # Line plot for CHURN_SUM
        sns.lineplot(x=summary.index, y='CHURN_SUM', data=summary, ax=ax2, color='red', marker='o')
        ax2.set_ylabel(f'Sum {target}', color='red')
        ax2.tick_params(axis='y', labelcolor='red')
        
        ax1.set_title(f'Count, Ratio, and Sum {target} by {categorical_col}')
    
    plt.tight_layout()
    plt.show()

for i in range(0, len(cat_cols), 2):
    cols_to_plot = cat_cols[i:i+2]
    target_summary_with_cat(df, "Churn", cols_to_plot)

PreferredLoginDevice
                      CHURN_SUM  Count  Ratio
PreferredLoginDevice                         
Computer                    324   1634 29.020
Mobile Phone                348   2765 49.110
Phone                       276   1231 21.870


PreferredPaymentMode
                      CHURN_SUM  Count  Ratio
PreferredPaymentMode                         
CC                           59    273  4.850
COD                         105    365  6.480
Cash on Delivery             23    149  2.650
Credit Card                 193   1501 26.660
Debit Card                  356   2314 41.100
E wallet                    140    614 10.910
UPI                          72    414  7.350

Gender
        CHURN_SUM  Count  Ratio
Gender                         
Female        348   2246 39.890
Male          600   3384 60.110


PreferedOrderCat
                    CHURN_SUM  Count  Ratio
PreferedOrderCat                           
Fashion                   128    826 14.670
Grocery                    20    410  7.280
Laptop & Accessory        210   2050 36.410
Mobile                    220    809 14.370
Mobile Phone              350   1271 22.580
Others                     20    264  4.690

MaritalStatus
               CHURN_SUM  Count  Ratio
MaritalStatus                         
Divorced             124    848 15.060
Married              344   2986 53.040
Single               480   1796 31.900


Churn
       CHURN_SUM  Count  Ratio
Churn                         
0              0   4682 83.160
1            948    948 16.840

CityTier
          CHURN_SUM  Count  Ratio
CityTier                         
1               532   3666 65.120
2                48    242  4.300
3               368   1722 30.590


HourSpendOnApp
                CHURN_SUM  Count  Ratio
HourSpendOnApp                         
0.000                   0      3  0.050
1.000                   0     35  0.620
2.000                 232   1471 26.130
3.000                 460   2687 47.730
4.000                 198   1176 20.890
5.000                   0      3  0.050

NumberOfDeviceRegistered
                          CHURN_SUM  Count  Ratio
NumberOfDeviceRegistered                         
1                                22    235  4.170
2                                26    276  4.900
3                               254   1699 30.180
4                               392   2377 42.220
5                               198    881 15.650
6                                56    162  2.880


SatisfactionScore
                   CHURN_SUM  Count  Ratio
SatisfactionScore                         
1                        134   1164 20.670
2                         74    586 10.410
3                        292   1698 30.160
4                        184   1074 19.080
5                        264   1108 19.680

Complain
          CHURN_SUM  Count  Ratio
Complain                         
0               440   4026 71.510
1               508   1604 28.490

df[num_cols].corr()

# Correlation Matrix
f, ax = plt.subplots(figsize=[18, 8])

# Calculate the correlation matrix
corr_matrix = df[num_cols].corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Draw the heatmap
sns.heatmap(corr_matrix, 
            mask=mask,  # Apply the mask to hide the upper triangle
            annot=True, 
            fmt=".2f", 
            ax=ax, 
            cmap="RdYlGn",
            cbar_kws={"shrink": .8},  # Shrink the colorbar a bit
            square=True,  # Ensure the cells are square
            linewidths=0.5)  # Add thin lines between cells

ax.set_title("Correlation of Numerical Variables", fontsize=18)
plt.tight_layout()  # Adjust the layout to prevent cutoff
plt.show()

df.isnull().sum().sort_values(ascending=0)

DaySinceLastOrder              307
OrderAmountHikeFromlastYear    265
Tenure                         264
OrderCount                     258
CouponUsed                     256
HourSpendOnApp                 255
WarehouseToHome                251
CustomerID                       0
MaritalStatus                    0
Complain                         0
NumberOfAddress                  0
PreferedOrderCat                 0
SatisfactionScore                0
Churn                            0
NumberOfDeviceRegistered         0
Gender                           0
PreferredPaymentMode             0
CityTier                         0
PreferredLoginDevice             0
CashbackAmount                   0
dtype: int64

def missing_values_table(dataframe, na_name=False):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]
    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")
    if na_name:
        return na_columns

na_columns = missing_values_table(df, na_name=True)

                             n_miss  ratio
DaySinceLastOrder               307  5.450
OrderAmountHikeFromlastYear     265  4.710
Tenure                          264  4.690
OrderCount                      258  4.580
CouponUsed                      256  4.550
HourSpendOnApp                  255  4.530
WarehouseToHome                 251  4.460

df["DaySinceLastOrder"].fillna(df["DaySinceLastOrder"].median(), inplace=True)
df["OrderAmountHikeFromlastYear"].fillna(df["OrderAmountHikeFromlastYear"].median(), inplace=True)
df["Tenure"].fillna(df["Tenure"].median(), inplace=True)
df["OrderCount"].fillna(df["OrderCount"].median(), inplace=True)
df["CouponUsed"].fillna(df["CouponUsed"].median(), inplace=True)
df["HourSpendOnApp"].fillna(df["HourSpendOnApp"].median(), inplace=True)
df["WarehouseToHome"].fillna(df["WarehouseToHome"].median(), inplace=True)

df.isnull().sum()

CustomerID                     0
Churn                          0
Tenure                         0
PreferredLoginDevice           0
CityTier                       0
WarehouseToHome                0
PreferredPaymentMode           0
Gender                         0
HourSpendOnApp                 0
NumberOfDeviceRegistered       0
PreferedOrderCat               0
SatisfactionScore              0
MaritalStatus                  0
NumberOfAddress                0
Complain                       0
OrderAmountHikeFromlastYear    0
CouponUsed                     0
OrderCount                     0
DaySinceLastOrder              0
CashbackAmount                 0
dtype: int64

def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False
    
for col in num_cols:
    print(col, check_outlier(df, col))

CustomerID False
Tenure False
WarehouseToHome True
NumberOfAddress False
OrderAmountHikeFromlastYear False
CouponUsed True
OrderCount False
DaySinceLastOrder True
CashbackAmount False

def replace_with_thresholds(dataframe, variable, q1=0.05, q3=0.95):
    low_limit, up_limit = outlier_thresholds(dataframe, variable, q1=0.05, q3=0.95)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

for col in num_cols:
    replace_with_thresholds(df,col)

for col in num_cols:
    print(col, check_outlier(df, col))

CustomerID False
Tenure False
WarehouseToHome False
NumberOfAddress False
OrderAmountHikeFromlastYear False
CouponUsed False
OrderCount False
DaySinceLastOrder False
CashbackAmount False

cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 5630
Variables: 20
cat_cols: 11
num_cols: 9
cat_but_car: 0
num_but_cat: 6

from sklearn.preprocessing import LabelEncoder

def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe


binary_cols = [col for col in df.columns if df[col].dtypes == "O" and df[col].nunique() == 2]
binary_cols

cat_cols = [col for col in cat_cols if col not in binary_cols and col not in ["Churn"]]

for col in binary_cols:
    df = label_encoder(df, col)

def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
    return dataframe


df = one_hot_encoder(df, cat_cols, drop_first=True)

df.head()

print(df.shape)

(5630, 43)

y = df["Churn"]
X = df.drop(["Churn"], axis=1)

def base_models(X, y, scoring="roc_auc"):
    print("Base Models...")
    classifiers = [('LR', LogisticRegression()),
                   ('KNN', KNeighborsClassifier()),
                   ("SVC", SVC()),
                   ("CART", DecisionTreeClassifier()),
                   ("RF", RandomForestClassifier()),
                   ('Adaboost', AdaBoostClassifier()),
                   ('GBM', GradientBoostingClassifier()),
                   ('XGBoost', XGBClassifier(eval_metric='logloss')),
                   ('LightGBM', LGBMClassifier(verbose=-1))   ]

    for name, classifier in classifiers:
        cv_results = cross_validate(classifier, X, y, cv=3, scoring=scoring)
        print(f"{scoring}: {round(cv_results['test_score'].mean(), 4)} ({name}) ")

base_models(X, y)

Base Models...
roc_auc: 0.8272 (LR) 
roc_auc: 0.5129 (KNN) 
roc_auc: 0.5329 (SVC) 
roc_auc: 0.8351 (CART) 
roc_auc: 0.9774 (RF) 
roc_auc: 0.9121 (Adaboost) 
roc_auc: 0.9287 (GBM) 
roc_auc: 0.9795 (XGBoost) 
roc_auc: 0.9782 (LightGBM)

KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

DecisionTreeClassifier().get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

XGBClassifier(eval_metric='logloss').get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': 'logloss',
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

LGBMClassifier().get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

knn_params = {"n_neighbors": range(2, 10)}

cart_params = {'max_depth': range(1, 20),
               "min_samples_split": range(1, 20)}

rf_params = {"max_depth": [8, 15, None],
             "max_features": [5, 7, "auto"],
             "min_samples_split": [15, 20],
             "n_estimators": [200, 300]}

xgboost_params = {"learning_rate": [0.1, 0.01, None],
                  "max_depth": [5, 8, None],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 1, None]}

lightgbm_params = {"learning_rate": [0.01, 0.1],
                   "n_estimators": [200, 300, 500],
                   "colsample_bytree": [0.1, 0.5, 1]}

classifiers = [('KNN', KNeighborsClassifier(), knn_params),
               ("CART", DecisionTreeClassifier(), cart_params),
               ("RF", RandomForestClassifier(), rf_params),
               ('XGBoost', XGBClassifier(eval_metric='logloss'), xgboost_params),
               ('LightGBM', LGBMClassifier(verbose=-1), lightgbm_params)]

def hyperparameter_optimization(X, y, cv=3, scoring="roc_auc"):
    print("Hyperparameter Optimization....")
    best_models = {}
    for name, classifier, params in classifiers:
        print(f"########## {name} ##########")
        cv_results = cross_validate(classifier, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (Before): {round(cv_results['test_score'].mean(), 4)}")

        gs_best = GridSearchCV(classifier, params, cv=cv, n_jobs=-1, verbose=False).fit(X, y)
        final_model = classifier.set_params(**gs_best.best_params_)

        cv_results = cross_validate(final_model, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (After): {round(cv_results['test_score'].mean(), 4)}")
        print(f"{name} best params: {gs_best.best_params_}", end="\n\n")
        best_models[name] = final_model
    return best_models

best_models = hyperparameter_optimization(X, y)

Hyperparameter Optimization....
########## KNN ##########
roc_auc (Before): 0.5129
roc_auc (After): 0.5273
KNN best params: {'n_neighbors': 8}

########## CART ##########
roc_auc (Before): 0.8606
roc_auc (After): 0.8524
CART best params: {'max_depth': 17, 'min_samples_split': 4}

########## RF ##########
roc_auc (Before): 0.9786
roc_auc (After): 0.9626
RF best params: {'max_depth': 15, 'max_features': 7, 'min_samples_split': 15, 'n_estimators': 200}

########## XGBoost ##########
roc_auc (Before): 0.9795
roc_auc (After): 0.9804
XGBoost best params: {'colsample_bytree': 1, 'learning_rate': None, 'max_depth': 8, 'n_estimators': 300}

########## LightGBM ##########
roc_auc (Before): 0.9782
roc_auc (After): 0.9804
LightGBM best params: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'n_estimators': 500}

def voting_classifier(best_models, X, y):
    print("Voting Classifier...")
    voting_clf = VotingClassifier(estimators=[('KNN', best_models["KNN"]), ('RF', best_models["RF"]),
                                              ('LightGBM', best_models["LightGBM"])],
                                  voting='soft').fit(X, y)
    cv_results = cross_validate(voting_clf, X, y, cv=3, scoring=["accuracy", "f1", "roc_auc"])
    print(f"Accuracy: {cv_results['test_accuracy'].mean()}")
    print(f"F1Score: {cv_results['test_f1'].mean()}")
    print(f"ROC_AUC: {cv_results['test_roc_auc'].mean()}")
    return voting_clf

voting_clf = voting_classifier(best_models, X, y)

Voting Classifier...
Accuracy: 0.9140195021543475
F1Score: 0.6035197385429875
ROC_AUC: 0.9606708693726915

model_LightGBM = LGBMClassifier(verbose=-1).fit(X, y)

model_XGBoost = XGBClassifier(eval_metric='logloss').fit(X, y)

model_LightGBM.feature_importances_

array([ 75, 291, 294,  71, 220, 281,  53,  99, 234, 359,  38,  33,  32,
        18,  36,  28,  44,  26,   4,  63,   5,  21,   9,  18,  79,  23,
        87,   0,  18,  16,  12,   0,  19,   7,  17,  24,  21,   7,  80,
        45,  79, 114])

def plot_importance(model, features, num=len(X)):
    # Create a DataFrame with feature importance values
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    
    # Calculate percentage of each feature's importance
    feature_imp['Percentage'] = feature_imp['Value'] / feature_imp['Value'].sum() * 100
    
    # Calculate total percentage
    total_percentage = feature_imp['Percentage'].sum()

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    
    # Sort and select top features
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]
    
    # Create the bar plot
    ax = sns.barplot(x="Value", y="Feature", data=top_features)

    # Annotate the bars with percentage
    for i in range(top_features.shape[0]):
        ax.text(top_features['Value'].iloc[i], i, 
                f"{top_features['Percentage'].iloc[i]:.2f}%", 
                color='black', ha="left", va="center", fontsize=10)

    # Show total percentage at the end of the plot
    ax.text(0.5, -0.1, f"Total Percentage: {total_percentage:.2f}%", 
            horizontalalignment='center', verticalalignment='center', 
            fontsize=12, color='blue', transform=ax.transAxes)
    plt.title('Features')
    plt.tight_layout()
    plt.show()

plot_importance(model_LightGBM, X)

plot_importance(model_XGBoost, X)

df = pd.read_csv("E_Commerce.csv")

from scipy import stats
# 1. Data Preparation
categories = ['Laptop & Accessory', 'Mobile Phone', 'Mobile']

# Filter for 2-4 hours spent
df_2_4_hours = df[(df['HourSpendOnApp'] >= 2) & (df['HourSpendOnApp'] <= 4)]

# Further filter for specific categories
df_filtered = df_2_4_hours[df_2_4_hours['PreferedOrderCat'].isin(categories)]

# 2. Testing the hypothesis
def test_category_percentage(df_2_4_hours, df_filtered):
    total_2_4_hour_sessions = len(df_2_4_hours)
    filtered_sessions = len(df_filtered)
    observed_percentage = filtered_sessions / total_2_4_hour_sessions
    
    # Perform a binomial test
    p0 = 0.73  # hypothesized value is 73%
    p_value = stats.binom_test(filtered_sessions, total_2_4_hour_sessions, p0, alternative='two-sided')  # two-sided test
    
    return observed_percentage, p_value
    
# Perform the test
category_percentage, category_p_value = test_category_percentage(df_2_4_hours, df_filtered)

# 3. Interpret results
alpha = 0.01  # significance level

print(f"Percentage of 2-4 hour sessions in Electronic categories: {category_percentage:.2f}")
print(f"P-value: {category_p_value:.4f}")

if category_p_value > alpha:
    print("Failed to reject the null hypothesis. The percentage of categories is not significantly different from 73%.")
else:
    print("Rejected the null hypothesis. The percentage of categories is significantly different from 73%.")

# Additional insights
print(f"\nTotal number of 2-4 hour sessions: {len(df_2_4_hours)}")
print(f"Number of 2-4 hour sessions in Electronic categories: {len(df_filtered)}")

Percentage of 2-4 hour sessions in Electronic categories: 0.72
P-value: 0.1605
Failed to reject the null hypothesis. The percentage of categories is not significantly different from 73%.

Total number of 2-4 hour sessions: 5334
Number of 2-4 hour sessions in Electronic categories: 3848

from scipy.stats import chi2_contingency
from scipy.stats import chi2
from sklearn.preprocessing import LabelEncoder

# List of category variables
category_columns = [
    'PreferredLoginDevice', 'PreferredPaymentMode', 'Gender', 'PreferedOrderCat',
    'MaritalStatus', 'Churn', 'CityTier', 'HourSpendOnApp',
    'NumberOfDeviceRegistered', 'SatisfactionScore', 'Complain']

# Encode all categorical variables
df_encoded = df.copy()
for col in category_columns:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Function to calculate Cramér's V
def cramers_v(chi2_stat, n, dof):
    return np.sqrt(chi2_stat / (n * (min(dof) - 1)))

# Analyze association between 'HourSpendOnApp' and other categorical variables
results = {}
for col in category_columns:
    if col != 'HourSpendOnApp':  # Skip 'HourSpendOnApp' itself
        
        # Create contingency table between 'HourSpendOnApp' and the current variable
        contingency_table = pd.crosstab(df_encoded['HourSpendOnApp'], df_encoded[col])
        
        # Perform Chi-square test of independence
        chi2_stat, p, dof, expected = chi2_contingency(contingency_table)
        n = contingency_table.values.sum()
        
        # Check the p-value to determine if the variables are dependent
        if p < 0.05:
            dependency = 'Dependent'
        else:
            dependency = 'Independent'
            
        # Calculate Cramér's V to measure association strength
        cramer_v_stat = cramers_v(chi2_stat, n, contingency_table.shape)
        
        # Store results
        results[col] = {
            'Chi2 Stat': chi2_stat,
            'p-value': p,
            'Dependency': dependency,
            'Cramér\'s V': cramer_v_stat}
        
# Display the results
results_df = pd.DataFrame(results).T.sort_values(['Dependency', 'Cramér\'s V'], ascending=[True, False])
print(results_df)

                         Chi2 Stat p-value   Dependency Cramér's V
PreferedOrderCat          1838.706   0.000    Dependent      0.256
PreferredLoginDevice       601.795   0.000    Dependent      0.231
NumberOfDeviceRegistered  1416.454   0.000    Dependent      0.224
PreferredPaymentMode       424.060   0.000    Dependent      0.112
Churn                       16.002   0.014    Dependent      0.053
CityTier                    28.221   0.005    Dependent      0.050
Gender                       8.281   0.218  Independent      0.038
MaritalStatus               16.412   0.173  Independent      0.038
SatisfactionScore           15.214   0.914  Independent      0.026
Complain                     3.446   0.751  Independent      0.025

import statsmodels.api as sm
from statsmodels.formula.api import ols

def perform_anova_and_rsquared(df, categorical_var, numerical_vars, significance_level=0.05):
    results = {}
    for num_var in numerical_vars:
        
        # Remove rows with missing values for the current pair of variables
        valid_data = df[[categorical_var, num_var]].dropna()
        
        if len(valid_data[categorical_var].unique()) < 2:
            results[num_var] = {
                'F_statistic': np.nan,
                'p_value': np.nan,
                'R_squared': np.nan,
                'Dependency': 'N/A'}
            continue
            
        # Perform one-way ANOVA
        groups = [group for _, group in valid_data.groupby(categorical_var)[num_var]]
        f_statistic, p_value = stats.f_oneway(*groups)
        
        # Calculate R-squared
        model = ols(f'{num_var} ~ C({categorical_var})', data=valid_data).fit()
        r_squared = model.rsquared
        
        # Determine dependency
        dependency = 'Dependent' if p_value < significance_level else 'Independent'
        
        results[num_var] = {
            'F_statistic': f_statistic,
            'p_value': round(p_value, 6),
            'Dependency': dependency,
            'R_squared': r_squared}
    return results
    
# List of numerical variables
numerical_variables = [
    'Tenure', 'WarehouseToHome', 'NumberOfAddress', 'OrderAmountHikeFromlastYear',
    'CouponUsed', 'OrderCount', 'DaySinceLastOrder', 'CashbackAmount']

# Perform ANOVA and calculate R-squared
results = perform_anova_and_rsquared(df, 'HourSpendOnApp', numerical_variables)

# Print results
anova_results_df = pd.DataFrame(results).T.sort_values('R_squared', ascending = False)
print(anova_results_df)

                            F_statistic p_value   Dependency R_squared
CouponUsed                       40.294   0.000    Dependent     0.038
NumberOfAddress                  23.513   0.000    Dependent     0.021
CashbackAmount                   17.728   0.000    Dependent     0.016
OrderAmountHikeFromlastYear      14.936   0.000    Dependent     0.014
OrderCount                       13.623   0.000    Dependent     0.013
DaySinceLastOrder                 6.226   0.000    Dependent     0.006
WarehouseToHome                   4.069   0.001    Dependent     0.004
Tenure                            1.342   0.243  Independent     0.001

	count	mean	std	min	25%	50%	75%	max
CustomerID	5630.000	52815.500	1625.385	50001.000	51408.250	52815.500	54222.750	55630.000
Churn	5630.000	0.168	0.374	0.000	0.000	0.000	0.000	1.000
Tenure	5366.000	10.190	8.557	0.000	2.000	9.000	16.000	61.000
CityTier	5630.000	1.655	0.915	1.000	1.000	1.000	3.000	3.000
WarehouseToHome	5379.000	15.640	8.531	5.000	9.000	14.000	20.000	127.000
HourSpendOnApp	5375.000	2.932	0.722	0.000	2.000	3.000	3.000	5.000
NumberOfDeviceRegistered	5630.000	3.689	1.024	1.000	3.000	4.000	4.000	6.000
SatisfactionScore	5630.000	3.067	1.380	1.000	2.000	3.000	4.000	5.000
NumberOfAddress	5630.000	4.214	2.584	1.000	2.000	3.000	6.000	22.000
Complain	5630.000	0.285	0.451	0.000	0.000	0.000	1.000	1.000
OrderAmountHikeFromlastYear	5365.000	15.708	3.675	11.000	13.000	15.000	18.000	26.000
CouponUsed	5374.000	1.751	1.895	0.000	1.000	1.000	2.000	16.000
OrderCount	5372.000	3.008	2.940	1.000	1.000	2.000	3.000	16.000
DaySinceLastOrder	5323.000	4.543	3.654	0.000	2.000	3.000	7.000	46.000
CashbackAmount	5630.000	177.221	49.194	0.000	146.000	163.000	196.000	325.000

	CustomerID	Tenure	WarehouseToHome	NumberOfAddress	OrderAmountHikeFromlastYear	CouponUsed	OrderCount	DaySinceLastOrder	CashbackAmount
count	5630.000	5366.000	5379.000	5630.000	5365.000	5374.000	5372.000	5323.000	5630.000
mean	52815.500	10.190	15.640	4.214	15.708	1.751	3.008	4.543	177.221
std	1625.385	8.557	8.531	2.584	3.675	1.895	2.940	3.654	49.194
min	50001.000	0.000	5.000	1.000	11.000	0.000	1.000	0.000	0.000
5%	50282.450	0.000	6.000	1.000	11.000	0.000	1.000	0.000	123.000
10%	50563.900	1.000	7.000	2.000	12.000	0.000	1.000	1.000	127.000
20%	51126.800	1.000	8.000	2.000	12.000	1.000	1.000	1.000	141.000
30%	51689.700	4.000	9.000	2.000	13.000	1.000	1.000	2.000	148.000
40%	52252.600	6.000	11.000	3.000	14.000	1.000	2.000	3.000	154.000
50%	52815.500	9.000	14.000	3.000	15.000	1.000	2.000	3.000	163.000
60%	53378.400	11.000	15.000	4.000	16.000	2.000	2.000	4.000	173.000
70%	53941.300	14.000	17.000	5.000	17.000	2.000	2.000	7.000	187.000
80%	54504.200	18.000	23.000	6.000	19.000	2.000	4.000	8.000	209.000
90%	55067.100	23.000	30.000	8.000	21.000	4.000	7.000	9.000	260.000
95%	55348.550	27.000	33.000	10.000	23.000	6.000	9.000	11.000	292.000
99%	55573.710	30.000	35.000	11.000	25.000	9.000	15.000	15.000	318.710
max	55630.000	61.000	127.000	22.000	26.000	16.000	16.000	46.000	325.000

	CustomerID	Tenure	WarehouseToHome	NumberOfAddress	OrderAmountHikeFromlastYear	CouponUsed	OrderCount	DaySinceLastOrder	CashbackAmount
Churn
0	52829.460	11.500	15.350	4.160	15.720	1.760	3.050	4.810	180.630
1	52746.580	3.380	17.130	4.470	15.630	1.720	2.820	3.240	160.370

	CustomerID	Tenure	WarehouseToHome	NumberOfAddress	OrderAmountHikeFromlastYear	CouponUsed	OrderCount	DaySinceLastOrder	CashbackAmount
CustomerID	1.000	0.030	0.059	0.161	0.117	0.234	0.139	0.113	0.217
Tenure	0.030	1.000	-0.018	0.238	0.006	0.129	0.186	0.185	0.477
WarehouseToHome	0.059	-0.018	1.000	-0.011	0.039	-0.004	0.003	0.018	-0.010
NumberOfAddress	0.161	0.238	-0.011	1.000	0.016	0.042	-0.008	-0.065	0.187
OrderAmountHikeFromlastYear	0.117	0.006	0.039	0.016	1.000	0.033	0.023	0.006	0.018
CouponUsed	0.234	0.129	-0.004	0.042	0.033	1.000	0.745	0.359	0.286
OrderCount	0.139	0.186	0.003	-0.008	0.023	0.745	1.000	0.498	0.361
DaySinceLastOrder	0.113	0.185	0.018	-0.065	0.006	0.359	0.498	1.000	0.347
CashbackAmount	0.217	0.477	-0.010	0.187	0.018	0.286	0.361	0.347	1.000

Churn Analytics in E-commerce

Churn Analytics in E-commerce¶

Contents:¶

1. Overview¶

1.1. Definition¶

1.2. Importance of analyzing customer churn¶

2. Understanding the dataset¶

3. Exploratory Data Analysis¶

3.1. Univariate Analysis¶

3.1.1. Categorical variables¶

Comment 2¶

3.1.2. Numerical variables¶

3.2. Bivariate Analysis¶

4. Data processing and Feature Engineering¶

4.1. Handling Missing values¶

4.2. Outlier treatment¶

4.3. Encoding¶

5. Customer Churn Prediction¶

5.1. Base Model¶

5.2. Hyperparameter Optimization¶

5.3. Stacking & Ensemble Learning¶

5.4. Feature Importance¶

6. Recommendations¶

6.1. Short-term plans:¶

6.2. Long-term plans:¶

7. Discussion¶

Share on

	CustomerID	Churn	Tenure	PreferredLoginDevice	CityTier	WarehouseToHome	PreferredPaymentMode	Gender	HourSpendOnApp	NumberOfDeviceRegistered	PreferedOrderCat	SatisfactionScore	MaritalStatus	NumberOfAddress	Complain	OrderAmountHikeFromlastYear	CouponUsed	OrderCount	DaySinceLastOrder	CashbackAmount
0	50001	1	4.000	Mobile Phone	3	6.000	Debit Card	Female	3.000	3	Laptop & Accessory	2	Single	9	1	11.000	1.000	1.000	5.000	160
1	50002	1	NaN	Phone	1	8.000	UPI	Male	3.000	4	Mobile	3	Single	7	1	15.000	0.000	1.000	0.000	121
2	50003	1	NaN	Phone	1	30.000	Debit Card	Male	2.000	4	Mobile	3	Single	6	1	14.000	0.000	1.000	3.000	120
3	50004	1	0.000	Phone	3	15.000	Debit Card	Male	2.000	4	Laptop & Accessory	5	Single	8	0	23.000	0.000	1.000	3.000	134
4	50005	1	0.000	Phone	1	12.000	CC	Male	NaN	3	Mobile	5	Single	3	0	11.000	1.000	1.000	3.000	130
5	50006	1	0.000	Computer	1	22.000	Debit Card	Female	3.000	5	Mobile Phone	5	Single	2	1	22.000	4.000	6.000	7.000	139
6	50007	1	NaN	Phone	3	11.000	Cash on Delivery	Male	2.000	3	Laptop & Accessory	2	Divorced	4	0	14.000	0.000	1.000	0.000	121
7	50008	1	NaN	Phone	1	6.000	CC	Male	3.000	3	Mobile	2	Divorced	3	1	16.000	2.000	2.000	0.000	123
8	50009	1	13.000	Phone	3	9.000	E wallet	Male	NaN	4	Mobile	3	Divorced	2	1	14.000	0.000	1.000	2.000	127
9	50010	1	NaN	Phone	1	31.000	Debit Card	Male	2.000	5	Mobile	3	Single	2	0	12.000	1.000	1.000	1.000	123

	count	unique	top	freq
PreferredLoginDevice	5630	3	Mobile Phone	2765
PreferredPaymentMode	5630	7	Debit Card	2314
Gender	5630	2	Male	3384
PreferedOrderCat	5630	6	Laptop & Accessory	2050
MaritalStatus	5630	3	Married	2986