Bank Customer Churn: Class Imbalance Methods#

This notebook compares several imbalance-handling strategies for a binary churn classification problem.

The goal is not only to train a Logistic Regression model, but to write the workflow in a clean, repeatable, and professional machine learning style:

  • central configuration

  • train/test split before preprocessing

  • ColumnTransformer + Pipeline

  • model/experiment registry

  • one evaluation loop

  • clean result table

  • saved best full pipeline with joblib

import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import joblib

import numpy as np
import pandas as pd
import sklearn  

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline as SklearnPipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours

pd.set_option("display.max_columns", 100)
sns.set_theme(style="whitegrid")
sklearn.set_config(transform_output="pandas")  

Configuration#

CONFIG = {
    "data": {
        "path": "../data/Bank Customer Churn Prediction.csv",
        "target": "churn",
        "drop_columns": ["customer_id"],
        "test_size": 0.2,
        "stratify": True,
    },

    "global": {
        "seed": 42,
        "max_iter": 1000,
        "threshold": 0.5,
    },

    "evaluation": {
        "optimize_metric": "f1",
        "report_metrics": [
            "accuracy",
            "precision",
            "recall",
            "f1",
            "roc_auc",
            "pr_auc",
        ],
    },

    "artifacts": {
        "model_dir": "../models",
        "best_model_name": "best_churn_imbalance_pipeline.joblib",
        "results_dir": "../outputs",
        "results_name": "churn_imbalance_results.csv",
    },
}

Load and inspect data#

df = pd.read_csv(CONFIG["data"]["path"])

df.head()
customer_id credit_score country gender age tenure balance products_number credit_card active_member estimated_salary churn
0 15634602 619 France Female 42 2 0.00 1 1 1 101348.88 1
1 15647311 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0
2 15619304 502 France Female 42 8 159660.80 3 1 0 113931.57 1
3 15701354 699 France Female 39 1 0.00 2 0 0 93826.63 0
4 15737888 850 Spain Female 43 2 125510.82 1 1 1 79084.10 0
print("Shape:", df.shape)
df.info()
Shape: (10000, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB
missing_values = df.isna().sum().sort_values(ascending=False)
missing_values[missing_values > 0]
Series([], dtype: int64)

Target balance#

target = CONFIG["data"]["target"]

class_counts = df[target].value_counts().sort_index()
class_percentages = df[target].value_counts(normalize=True).sort_index() * 100

target_balance = pd.DataFrame({
    "count": class_counts,
    "percentage": class_percentages.round(2),
})

target_balance
count percentage
churn
0 7963 79.63
1 2037 20.37
plt.figure(figsize=(6, 4))
sns.barplot(
    x=class_counts.index,
    y=class_counts.values,
)

plt.title("Churn Class Distribution")
plt.xlabel("Churn")
plt.ylabel("Count")
plt.xticks([0, 1], ["No churn", "Churn"])
plt.show()
../_images/3f0a19063ef3e50e99d9eccfcb5e3e5935c52ce7234a02090bbdd1ecb6d9737f.png

Prepare features and target#

X = df.drop(columns=CONFIG["data"]["drop_columns"] + [target])
y = df[target]

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)
Numeric features: ['credit_score', 'age', 'tenure', 'balance', 'products_number', 'credit_card', 'active_member', 'estimated_salary']
Categorical features: ['country', 'gender']
stratify_target = y if CONFIG["data"]["stratify"] else None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=CONFIG["data"]["test_size"],
    random_state=CONFIG["global"]["seed"],
    stratify=stratify_target,
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print()
print("Train target balance:")
print(y_train.value_counts(normalize=True).round(3))
print()
print("Test target balance:")
print(y_test.value_counts(normalize=True).round(3))
Train shape: (8000, 10)
Test shape: (2000, 10)

Train target balance:
churn
0    0.796
1    0.204
Name: proportion, dtype: float64

Test target balance:
churn
0    0.796
1    0.204
Name: proportion, dtype: float64

Build preprocessing pipeline#

def build_preprocessor(numeric_features, categorical_features):
    numeric_transformer = SklearnPipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = SklearnPipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )

    return ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )


preprocessor = build_preprocessor(
    numeric_features=numeric_features,
    categorical_features=categorical_features,
)

Define imbalance experiments#

EXPERIMENTS = {
    "baseline": {
        "sampler": None,
        "model_params": {
            "class_weight": None,
        },
    },

    "random_over_sampler": {
        "sampler": RandomOverSampler(random_state=CONFIG["global"]["seed"]),
        "model_params": {
            "class_weight": None,
        },
    },

    "smote": {
        "sampler": SMOTE(random_state=CONFIG["global"]["seed"]),
        "model_params": {
            "class_weight": None,
        },
    },

    "adasyn": {
        "sampler": ADASYN(random_state=CONFIG["global"]["seed"]),
        "model_params": {
            "class_weight": None,
        },
    },

    "random_under_sampler": {
        "sampler": RandomUnderSampler(random_state=CONFIG["global"]["seed"]),
        "model_params": {
            "class_weight": None,
        },
    },

    "edited_nearest_neighbours": {
        "sampler": EditedNearestNeighbours(),
        "model_params": {
            "class_weight": None,
        },
    },

    "class_weight_balanced": {
        "sampler": None,
        "model_params": {
            "class_weight": "balanced",
        },
    },

    "class_weight_custom_1_to_10": {
        "sampler": None,
        "model_params": {
            "class_weight": {0: 1, 1: 10},
        },
    },
}
def build_model(preprocessor, sampler=None, model_params=None):
    model_params = model_params or {}

    classifier = LogisticRegression(
        max_iter=CONFIG["global"]["max_iter"],
        random_state=CONFIG["global"]["seed"],
        **model_params,
    )

    if sampler is None:
        return SklearnPipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("classifier", classifier),
            ]
        )

    return ImbPipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("sampler", sampler),
            ("classifier", classifier),
        ]
    )

Train and evaluate all experiments#

def evaluate_classifier(model, X_test, y_test, threshold=0.5):
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba),
        "pr_auc": average_precision_score(y_test, y_proba),
    }


trained_models = {}
results = []

for experiment_name, experiment_config in EXPERIMENTS.items():
    model = build_model(
        preprocessor=preprocessor,
        sampler=experiment_config["sampler"],
        model_params=experiment_config["model_params"],
    )

    model.fit(X_train, y_train)

    metrics = evaluate_classifier(
        model=model,
        X_test=X_test,
        y_test=y_test,
        threshold=CONFIG["global"]["threshold"],
    )

    trained_models[experiment_name] = model
    results.append({
        "experiment": experiment_name,
        **metrics,
    })

results_df = (
    pd.DataFrame(results)
    .sort_values(CONFIG["evaluation"]["optimize_metric"], ascending=False)
    .reset_index(drop=True)
)

results_df
experiment accuracy precision recall f1 roc_auc pr_auc
0 random_over_sampler 0.7160 0.391069 0.710074 0.504363 0.778120 0.469027
1 smote 0.7175 0.390884 0.695332 0.500442 0.775364 0.465535
2 class_weight_balanced 0.7135 0.387228 0.700246 0.498688 0.777165 0.467919
3 random_under_sampler 0.7035 0.378906 0.714988 0.495319 0.774673 0.467613
4 adasyn 0.6975 0.373077 0.714988 0.490312 0.766827 0.463134
5 edited_nearest_neighbours 0.7925 0.489130 0.442260 0.464516 0.764393 0.472329
6 class_weight_custom_1_to_10 0.4600 0.263528 0.921376 0.409836 0.777334 0.459944
7 baseline 0.8080 0.589147 0.186732 0.283582 0.774758 0.478928

Model comparison#

metric_columns = CONFIG["evaluation"]["report_metrics"]

display(
    results_df.style.format({
        metric: "{:.3f}" for metric in metric_columns
    })
)
  experiment accuracy precision recall f1 roc_auc pr_auc
0 random_over_sampler 0.716 0.391 0.710 0.504 0.778 0.469
1 smote 0.718 0.391 0.695 0.500 0.775 0.466
2 class_weight_balanced 0.714 0.387 0.700 0.499 0.777 0.468
3 random_under_sampler 0.704 0.379 0.715 0.495 0.775 0.468
4 adasyn 0.698 0.373 0.715 0.490 0.767 0.463
5 edited_nearest_neighbours 0.792 0.489 0.442 0.465 0.764 0.472
6 class_weight_custom_1_to_10 0.460 0.264 0.921 0.410 0.777 0.460
7 baseline 0.808 0.589 0.187 0.284 0.775 0.479
plot_df = results_df.melt(
    id_vars="experiment",
    value_vars=["precision", "recall", "f1"],
    var_name="metric",
    value_name="score",
)

plt.figure(figsize=(10, 5))
sns.barplot(
    data=plot_df,
    x="score",
    y="experiment",
    hue="metric",
)

plt.title("Precision, Recall, and F1 by Imbalance Method")
plt.xlabel("Score")
plt.ylabel("Experiment")
plt.xlim(0, 1)
plt.legend(title="Metric")
plt.show()
../_images/cedddb00314718da8f90c644dff5445c1acea76bd620a762110fdc04fa78f592.png

Best model#

best_experiment = results_df.loc[0, "experiment"]
best_model = trained_models[best_experiment]

print("Best experiment:", best_experiment)
print("Optimized by:", CONFIG["evaluation"]["optimize_metric"])
print()
print(results_df.loc[0])
Best experiment: random_over_sampler
Optimized by: f1

experiment    random_over_sampler
accuracy                    0.716
precision                0.391069
recall                   0.710074
f1                       0.504363
roc_auc                   0.77812
pr_auc                   0.469027
Name: 0, dtype: object
y_proba_best = best_model.predict_proba(X_test)[:, 1]
y_pred_best = (y_proba_best >= CONFIG["global"]["threshold"]).astype(int)

print(classification_report(y_test, y_pred_best, target_names=["No churn", "Churn"]))
              precision    recall  f1-score   support

    No churn       0.91      0.72      0.80      1593
       Churn       0.39      0.71      0.50       407

    accuracy                           0.72      2000
   macro avg       0.65      0.71      0.65      2000
weighted avg       0.80      0.72      0.74      2000
cm = confusion_matrix(y_test, y_pred_best)

plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Predicted no churn", "Predicted churn"],
    yticklabels=["Actual no churn", "Actual churn"],
)

plt.title(f"Confusion Matrix - {best_experiment}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
../_images/f0052b71332bcfc526ac1784e4b77c5d619ed41dbccb2d36862bd0af3142768b.png
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

RocCurveDisplay.from_predictions(y_test, y_proba_best)
plt.title(f"ROC Curve - {best_experiment}")
plt.show()

PrecisionRecallDisplay.from_predictions(y_test, y_proba_best)
plt.title(f"Precision-Recall Curve - {best_experiment}")
plt.show()
../_images/8a62a06b30785a0b1707d6f0b1163125db714dd4c79e30ae7cfe58d6e355a7bd.png ../_images/9139ec09d1e420279d4f8c44a8b8a78e02166a5fc3fc7cc8e2107207900c36f7.png

Save results and best model#

model_dir = Path(CONFIG["artifacts"]["model_dir"])
results_dir = Path(CONFIG["artifacts"]["results_dir"])

model_dir.mkdir(parents=True, exist_ok=True)
results_dir.mkdir(parents=True, exist_ok=True)

model_path = model_dir / CONFIG["artifacts"]["best_model_name"]
results_path = results_dir / CONFIG["artifacts"]["results_name"]

joblib.dump(best_model, model_path)
results_df.to_csv(results_path, index=False)

print("Saved best model to:", model_path)
print("Saved results to:", results_path)
Saved best model to: ..\models\best_churn_imbalance_pipeline.joblib
Saved results to: ..\outputs\churn_imbalance_results.csv
loaded_model = joblib.load(model_path)

sample_predictions = loaded_model.predict(X_test.head())
sample_predictions
array([0, 1, 0, 0, 0])