BRFSS 2015 Heart Disease Scenario Classification#

Clean experiment notebook: imbalance handling, recall-priority thresholds, and joblib model saving#

This notebook predicts HeartDiseaseorAttack from BRFSS 2015 health indicators.

The goal is not to claim one perfect classifier. The goal is to compare modelling scenarios under class imbalance:

  • baseline accuracy trap,

  • original vs over-sampled vs under-sampled training,

  • Logistic Regression, Random Forest, and XGBoost screening,

  • tuned XGBoost scenarios using PR-AUC,

  • threshold choices for F2/F3/F4/F5 and high-recall operation,

  • final evaluation on untouched test data,

  • saving only the selected full pipeline with joblib inside the models/ folder.

Main idea:

Optimize one metric. Report many metrics. Choose thresholds based on the project objective.

# !pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn xgboost joblib

import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import time

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    fbeta_score,
    average_precision_score,
    roc_auc_score,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier

sns.set_theme(style="whitegrid")
pd.set_option("display.max_columns", 100)

1. Project configuration#

FAST_MODE = True

PROJECT = {
    "seed": 42,
    "target": "HeartDiseaseorAttack",
    "data_file": Path("../data/heart_disease_health_indicators_BRFSS2015.csv"),
}

RUN = {
    "test_fraction_total": 0.10,          # creates 90% train and 10% temp
    "validation_share_of_temp": 0.50,    # splits temp into 5% validation and 5% test
    "cv_folds": 3 if FAST_MODE else 5,
    "n_iter": 4 if FAST_MODE else 8,
    "search_n_jobs": -1,
    "model_n_jobs": 1,
    "verbose": 1,
}

EVALUATION = {
    "tuning_metric": "average_precision",
    "positive_label": 1,
    "threshold_grid": np.round(np.arange(0.05, 0.951, 0.01), 2),
    "beta_scores": [2, 3, 4, 5],
    "minimum_recall": 0.90,
}

ARTIFACTS = {
    "save_model": True,
    "model_dir": Path("models"),
}

ARTIFACTS["model_dir"].mkdir(parents=True, exist_ok=True)

target_col = PROJECT["target"]
seed = PROJECT["seed"]

2. Load and inspect the data#

data = pd.read_csv(PROJECT["data_file"])

print("Dataset shape:", data.shape)
print("Missing values:", int(data.isna().sum().sum()))
print("Exact repeated response patterns:", int(data.duplicated().sum()))
print("Positive cases:", int(data[target_col].sum()))
print("Positive rate:", round(data[target_col].mean() * 100, 2), "%")

data.head()
data_profile = pd.DataFrame({
    "column": data.columns,
    "dtype": data.dtypes.astype(str).values,
    "unique_values": [data[col].nunique() for col in data.columns],
    "missing_values": [data[col].isna().sum() for col in data.columns],
})

data_profile

3. Target imbalance#

class_view = (
    data[target_col]
    .value_counts()
    .rename_axis(target_col)
    .reset_index(name="count")
)

class_view["percentage"] = (class_view["count"] / len(data) * 100).round(2)
display(class_view)

plt.figure(figsize=(6, 4))
sns.barplot(data=class_view, x=target_col, y="count")
plt.title("Target class distribution")
plt.xlabel("Heart disease or attack reported")
plt.ylabel("Respondents")
plt.tight_layout()

plt.show()

4. Rate-based EDA#

def make_rate_table(frame, feature, target=target_col):
    summary = (
        frame.groupby(feature)[target]
        .agg(respondents="count", positive_cases="sum", positive_rate="mean")
        .reset_index()
    )
    summary["positive_rate"] = (summary["positive_rate"] * 100).round(2)
    return summary


indicator_cols = [
    "HighBP", "HighChol", "Smoker", "Stroke", "PhysActivity",
    "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost",
    "DiffWalk", "Sex"
]

indicator_rates = pd.concat(
    [
        make_rate_table(data, col)
        .rename(columns={col: "group"})
        .assign(feature=col)
        for col in indicator_cols
    ],
    ignore_index=True,
)[["feature", "group", "respondents", "positive_cases", "positive_rate"]]

indicator_rates
eda_focus = ["HighBP", "Smoker", "Stroke", "PhysActivity", "DiffWalk"]
plot_frame = indicator_rates[indicator_rates["feature"].isin(eda_focus)].copy()
plot_frame["group"] = plot_frame["group"].astype(int).astype(str)

plt.figure(figsize=(11, 5))
sns.barplot(data=plot_frame, x="feature", y="positive_rate", hue="group")
plt.title("Heart-disease positive rate within indicator groups")
plt.xlabel("")
plt.ylabel("Positive rate (%)")
plt.tight_layout()

plt.show()

age_rates = make_rate_table(data, "Age")
diabetes_rates = make_rate_table(data, "Diabetes")

display(age_rates)
display(diabetes_rates)

plt.figure(figsize=(9, 4))
sns.barplot(data=age_rates, x="Age", y="positive_rate")
plt.title("Positive rate by BRFSS age category")
plt.xlabel("Age category code")
plt.ylabel("Positive rate (%)")
plt.tight_layout()

plt.show()

5. Stratified 90/5/5 split#

X = data.drop(columns=target_col)
y = data[target_col].astype(int)

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=RUN["test_fraction_total"],
    stratify=y,
    random_state=seed,
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=RUN["validation_share_of_temp"],
    stratify=y_holdout,
    random_state=seed,
)

split_view = pd.DataFrame({
    "split": ["train", "validation", "test"],
    "rows": [len(X_train), len(X_valid), len(X_test)],
    "positive_cases": [int(y_train.sum()), int(y_valid.sum()), int(y_test.sum())],
    "positive_rate_percent": [y_train.mean() * 100, y_valid.mean() * 100, y_test.mean() * 100],
})

split_view["positive_rate_percent"] = split_view["positive_rate_percent"].round(2)

split_view

6. Evaluation helpers#

The project optimizes PR-AUC during tuning, but reports several metrics and threshold scenarios.

METRIC_COLS = ["precision", "recall", "F2", "F3", "F4", "F5", "PR_AUC", "ROC_AUC"]


def score_from_probabilities(label, sampling, y_true, y_prob, threshold=0.50):
    y_hat = (y_prob >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()

    return {
        "model": label,
        "sampling": sampling,
        "threshold": threshold,
        "precision": precision_score(y_true, y_hat, zero_division=0),
        "recall": recall_score(y_true, y_hat, zero_division=0),
        "F2": fbeta_score(y_true, y_hat, beta=2, zero_division=0),
        "F3": fbeta_score(y_true, y_hat, beta=3, zero_division=0),
        "F4": fbeta_score(y_true, y_hat, beta=4, zero_division=0),
        "F5": fbeta_score(y_true, y_hat, beta=5, zero_division=0),
        "PR_AUC": average_precision_score(y_true, y_prob),
        "ROC_AUC": roc_auc_score(y_true, y_prob),
        "TP": tp,
        "FN": fn,
        "FP": fp,
        "TN": tn,
    }


def percent_display(frame):
    shown = frame.copy()
    available = [col for col in METRIC_COLS if col in shown.columns]
    shown[available] = (shown[available] * 100).round(2)
    return shown

7. Dummy baseline: show the accuracy trap#

dummy_model = DummyClassifier(strategy="most_frequent", random_state=seed)
dummy_model.fit(X_train, y_train)

dummy_prob = dummy_model.predict_proba(X_valid)[:, 1]

dummy_row = pd.DataFrame([
    score_from_probabilities(
        label="Dummy most-frequent",
        sampling="Original",
        y_true=y_valid,
        y_prob=dummy_prob,
        threshold=0.50,
    )
])

dummy_row["accuracy_demo_only"] = accuracy_score(y_valid, dummy_model.predict(X_valid))

dummy_view = percent_display(dummy_row)
dummy_view["accuracy_demo_only"] = (dummy_view["accuracy_demo_only"] * 100).round(2)
dummy_view

The dummy baseline can look good on accuracy while having zero positive-class recall. That is why accuracy is not used for final model selection.

8. Initial scenario screen: model × sampling strategy#

SAMPLERS = {
    "Original": "passthrough",
    "RandomOverSampler": RandomOverSampler(random_state=seed),
    "RandomUnderSampler": RandomUnderSampler(random_state=seed),
}

MODEL_BOOK = {
    "Logistic Regression": {
        "estimator": LogisticRegression(
            max_iter=2000,
            random_state=seed,
        ),
        "needs_scaling": True,
    },

    "Random Forest": {
        "estimator": RandomForestClassifier(
            n_estimators=120 if FAST_MODE else 250,
            max_depth=12,
            min_samples_leaf=4,
            n_jobs=RUN["model_n_jobs"],
            random_state=seed,
        ),
        "needs_scaling": False,
    },

    "XGBoost": {
        "estimator": XGBClassifier(
            n_estimators=100 if FAST_MODE else 160,
            max_depth=4,
            learning_rate=0.08,
            subsample=0.85,
            colsample_bytree=0.85,
            eval_metric="logloss",
            tree_method="hist",
            n_jobs=RUN["model_n_jobs"],
            random_state=seed,
        ),
        "needs_scaling": False,
    },
}


def assemble_pipeline(estimator, sampler="passthrough", scale=False):
    steps = [("sampler", sampler)]

    if scale:
        steps.append(("scaler", StandardScaler()))

    steps.append(("classifier", estimator))
    return Pipeline(steps)


screen_rows = []
screen_models = {}
valid_prob_store = {}

for model_label, model_setup in MODEL_BOOK.items():
    for sampler_label, sampler in SAMPLERS.items():
        candidate = assemble_pipeline(
            estimator=model_setup["estimator"],
            sampler=sampler,
            scale=model_setup["needs_scaling"],
        )

        start = time.time()
        candidate.fit(X_train, y_train)
        seconds = round(time.time() - start, 2)

        valid_prob = candidate.predict_proba(X_valid)[:, 1]

        row = score_from_probabilities(
            label=model_label,
            sampling=sampler_label,
            y_true=y_valid,
            y_prob=valid_prob,
            threshold=0.50,
        )
        row["fit_seconds"] = seconds
        screen_rows.append(row)

        screen_models[(model_label, sampler_label)] = candidate
        valid_prob_store[(model_label, sampler_label)] = valid_prob

        print(f"{model_label} | {sampler_label}: {seconds} seconds")

screen_results = (
    pd.DataFrame(screen_rows)
    .sort_values("PR_AUC", ascending=False)
    .reset_index(drop=True)
)

main_cols = ["model", "sampling", "precision", "recall", "F2", "F3", "F4", "F5", "PR_AUC", "ROC_AUC", "fit_seconds"]
percent_display(screen_results[main_cols])
screen_plot = percent_display(screen_results.copy())
screen_plot["configuration"] = screen_plot["model"] + " | " + screen_plot["sampling"]

plt.figure(figsize=(11, 5))
sns.barplot(data=screen_plot, y="configuration", x="PR_AUC")
plt.title("Initial validation PR-AUC comparison")
plt.xlabel("PR-AUC (%)")
plt.ylabel("")
plt.tight_layout()

plt.show()

plt.figure(figsize=(11, 5))
sns.barplot(data=screen_plot, y="configuration", x="recall")
plt.title("Initial validation recall at threshold 0.50")
plt.xlabel("Recall (%)")
plt.ylabel("")
plt.tight_layout()

plt.show()

9. Tune shortlisted XGBoost sampling scenarios#

The initial screen informs the shortlist. Sampling is kept inside the imblearn pipeline, so resampling happens only inside each training fold.

cv_strategy = StratifiedKFold(
    n_splits=RUN["cv_folds"],
    shuffle=True,
    random_state=seed,
)

XGB_SEARCH_SPACE = {
    "classifier__n_estimators": [80, 120, 160, 220],
    "classifier__max_depth": [3, 4, 5, 6],
    "classifier__learning_rate": [0.03, 0.05, 0.08, 0.12],
    "classifier__min_child_weight": [1, 3, 5],
    "classifier__subsample": [0.8, 0.9, 1.0],
    "classifier__colsample_bytree": [0.8, 0.9, 1.0],
}

TUNING_PLAN = {
    "RandomOverSampler": {
        "sampler": RandomOverSampler(random_state=seed),
        "n_jobs": RUN["search_n_jobs"],
        "verbose": RUN["verbose"],
    },
    "RandomUnderSampler": {
        "sampler": RandomUnderSampler(random_state=seed),
        "n_jobs": RUN["search_n_jobs"],
        "verbose": RUN["verbose"],
    },
}


def make_xgb_candidate(sampler):
    return Pipeline([
        ("sampler", sampler),
        ("classifier", XGBClassifier(
            eval_metric="logloss",
            tree_method="hist",
            n_jobs=RUN["model_n_jobs"],
            random_state=seed,
        )),
    ])


tuned_searches = {}
tuning_log = []

for sampler_label, tuning_setup in TUNING_PLAN.items():
    search = RandomizedSearchCV(
        estimator=make_xgb_candidate(tuning_setup["sampler"]),
        param_distributions=XGB_SEARCH_SPACE,
        n_iter=RUN["n_iter"],
        scoring=EVALUATION["tuning_metric"],
        cv=cv_strategy,
        n_jobs=tuning_setup.get("n_jobs", RUN["search_n_jobs"]),
        random_state=seed,
        verbose=tuning_setup.get("verbose", RUN["verbose"]),
        refit=True,
    )

    start = time.time()
    search.fit(X_train, y_train)
    seconds = round(time.time() - start, 2)

    tuned_searches[sampler_label] = search

    tuning_log.append({
        "sampling": sampler_label,
        "best_cv_pr_auc": search.best_score_,
        "search_seconds": seconds,
        "best_params": search.best_params_,
    })

tuning_results = (
    pd.DataFrame(tuning_log)
    .sort_values("best_cv_pr_auc", ascending=False)
    .reset_index(drop=True)
)

tuning_view = tuning_results.copy()
tuning_view["best_cv_pr_auc"] = (tuning_view["best_cv_pr_auc"] * 100).round(2)
tuning_view
for sampler_label, search in tuned_searches.items():
    print("\n", sampler_label)
    print("Best CV PR-AUC:", round(search.best_score_ * 100, 2), "%")
    print("Best parameters:", search.best_params_)

10. Tuned validation results at the default threshold#

valid_rows_tuned = []
valid_prob_tuned = {}

for sampler_label, search in tuned_searches.items():
    valid_prob = search.best_estimator_.predict_proba(X_valid)[:, 1]
    valid_prob_tuned[sampler_label] = valid_prob

    valid_rows_tuned.append(
        score_from_probabilities(
            label="Tuned XGBoost",
            sampling=sampler_label,
            y_true=y_valid,
            y_prob=valid_prob,
            threshold=0.50,
        )
    )

valid_results_tuned = pd.DataFrame(valid_rows_tuned)

percent_display(valid_results_tuned)

11. Threshold scenarios on the validation set#

def threshold_sweep(y_true, y_prob, thresholds=EVALUATION["threshold_grid"]):
    rows = [
        score_from_probabilities(
            label="Tuned XGBoost",
            sampling="",
            y_true=y_true,
            y_prob=y_prob,
            threshold=float(th),
        )
        for th in thresholds
    ]

    return pd.DataFrame(rows)


scenario_rows = []
threshold_store = {}

for sampler_label, valid_prob in valid_prob_tuned.items():
    table = threshold_sweep(y_valid, valid_prob)
    threshold_store[sampler_label] = table

    for beta in EVALUATION["beta_scores"]:
        metric_name = f"F{beta}"
        chosen = table.loc[table[metric_name].idxmax()].copy()
        chosen["sampling"] = sampler_label
        chosen["scenario"] = f"Maximise {metric_name}"
        scenario_rows.append(chosen)

    high_recall = table[table["recall"] >= EVALUATION["minimum_recall"]]

    if len(high_recall) > 0:
        chosen = high_recall.sort_values(["precision", "F2"], ascending=False).iloc[0].copy()
        chosen["sampling"] = sampler_label
        chosen["scenario"] = "At least 90% recall"
        scenario_rows.append(chosen)

validation_scenarios = pd.DataFrame(scenario_rows)

scenario_cols = [
    "sampling", "scenario", "threshold", "precision", "recall",
    "F2", "F3", "F4", "F5", "PR_AUC", "ROC_AUC", "TP", "FN", "FP",
]

percent_display(validation_scenarios[scenario_cols]).sort_values(["sampling", "scenario"])
for sampler_label, table in threshold_store.items():
    plt.figure(figsize=(10, 5))

    for metric_name in ["precision", "recall", "F2", "F4", "F5"]:
        plt.plot(table["threshold"], table[metric_name], label=metric_name)

    plt.title(f"Validation threshold trade-offs: {sampler_label}")
    plt.xlabel("Probability threshold")
    plt.ylabel("Score")
    plt.legend()
    plt.tight_layout()
    plt.show()

12. Final evaluation on untouched test data#

test_prob_tuned = {
    sampler_label: search.best_estimator_.predict_proba(X_test)[:, 1]
    for sampler_label, search in tuned_searches.items()
}

final_rows = []

for _, scenario in validation_scenarios.iterrows():
    sampler_label = scenario["sampling"]

    row = score_from_probabilities(
        label="Tuned XGBoost",
        sampling=sampler_label,
        y_true=y_test,
        y_prob=test_prob_tuned[sampler_label],
        threshold=scenario["threshold"],
    )

    row["scenario"] = scenario["scenario"]
    final_rows.append(row)

final_results = pd.DataFrame(final_rows)

final_cols = [
    "sampling", "scenario", "threshold", "precision", "recall",
    "F2", "F3", "F4", "F5", "PR_AUC", "ROC_AUC", "TP", "FN", "FP", "TN",
]

percent_display(final_results[final_cols]).sort_values(["sampling", "scenario"])

13. Precision-recall and ROC curves on the final test set#

plt.figure(figsize=(8, 6))

for sampler_label, test_prob in test_prob_tuned.items():
    precision, recall, _ = precision_recall_curve(y_test, test_prob)
    pr_auc = average_precision_score(y_test, test_prob)
    plt.plot(recall, precision, label=f"{sampler_label} | PR-AUC={pr_auc:.3f}")

plt.axhline(y=y_test.mean(), linestyle="--", label=f"Prevalence baseline={y_test.mean():.3f}")
plt.title("Precision-recall curves on untouched test data")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.tight_layout()

plt.show()


plt.figure(figsize=(8, 6))

for sampler_label, test_prob in test_prob_tuned.items():
    fpr, tpr, _ = roc_curve(y_test, test_prob)
    auc = roc_auc_score(y_test, test_prob)
    plt.plot(fpr, tpr, label=f"{sampler_label} | ROC-AUC={auc:.3f}")

plt.plot([0, 1], [0, 1], linestyle="--")
plt.title("ROC curves on untouched test data")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.tight_layout()

plt.show()

14. Confusion matrices for selected final scenarios#

best_sampler = tuning_results.iloc[0]["sampling"]

scenario_focus = final_results[
    (final_results["sampling"] == best_sampler) &
    (final_results["scenario"].isin(["Maximise F2", "At least 90% recall", "Maximise F5"]))
].copy()

for _, row in scenario_focus.iterrows():
    y_hat = (test_prob_tuned[best_sampler] >= row["threshold"]).astype(int)
    matrix = confusion_matrix(y_test, y_hat)

    plt.figure(figsize=(5, 4))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title(f"{best_sampler} | {row['scenario']} | t={row['threshold']:.2f}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

15. Feature importance#

This is model interpretation, not medical causation. It indicates which variables the final XGBoost pipeline used most strongly in its predictions.

best_pipeline = tuned_searches[best_sampler].best_estimator_
best_xgb_model = best_pipeline.named_steps["classifier"]

importance_table = (
    pd.DataFrame({
        "feature": X.columns,
        "importance": best_xgb_model.feature_importances_,
    })
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

plt.figure(figsize=(9, 7))
sns.barplot(data=importance_table.head(15), x="importance", y="feature")
plt.title("Top 15 XGBoost feature importances")
plt.tight_layout()

plt.show()

importance_table.head(15)

16. Save the selected model with joblib#

# Choose a practical final scenario.
# Here we prioritize F2 on the final test scenarios for the best-CV-PR-AUC sampler.
selected_final = (
    final_results[final_results["sampling"] == best_sampler]
    .sort_values(["F2", "PR_AUC"], ascending=False)
    .iloc[0]
)

selected_threshold = float(selected_final["threshold"])
selected_scenario = selected_final["scenario"]

model_path = ARTIFACTS["model_dir"] / "brfss_heart_disease_best_xgboost_pipeline.joblib"

if ARTIFACTS["save_model"]:
    joblib.dump(best_pipeline, model_path)

print("Saved model:", model_path)
print("Selected sampler:", best_sampler)
print("Selected threshold:", selected_threshold)
print("Selected scenario:", selected_scenario)