Forest Cover Type Classification with XGBoost (Original Version)

Forest Cover Type Classification with XGBoost (Original Version)

Dataset: Forest Cover Type (Kaggle)

# Utility
import warnings

# Core libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt

# Machine learning
from sklearn.base import clone
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
# Load the Forest Cover Type table from a path relative to the working directory.
csv_path = "../data/forestcover.csv"
df = pd.read_csv(csv_path)

# Show the column names exactly as they appear in the CSV header.
print(list(df.columns))
['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40', 'Cover_Type']
# Make column names consistent and Python-friendly:
# trim surrounding whitespace, lower-case, and turn spaces/hyphens into underscores.
# regex=False makes the replacements literal on every pandas version
# (older releases defaulted to regex=True for str.replace).
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
)
print(df.columns.tolist())
['elevation', 'aspect', 'slope', 'horizontal_distance_to_hydrology', 'vertical_distance_to_hydrology', 'horizontal_distance_to_roadways', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm', 'horizontal_distance_to_fire_points', 'wilderness_area1', 'wilderness_area2', 'wilderness_area3', 'wilderness_area4', 'soil_type1', 'soil_type2', 'soil_type3', 'soil_type4', 'soil_type5', 'soil_type6', 'soil_type7', 'soil_type8', 'soil_type9', 'soil_type10', 'soil_type11', 'soil_type12', 'soil_type13', 'soil_type14', 'soil_type15', 'soil_type16', 'soil_type17', 'soil_type18', 'soil_type19', 'soil_type20', 'soil_type21', 'soil_type22', 'soil_type23', 'soil_type24', 'soil_type25', 'soil_type26', 'soil_type27', 'soil_type28', 'soil_type29', 'soil_type30', 'soil_type31', 'soil_type32', 'soil_type33', 'soil_type34', 'soil_type35', 'soil_type36', 'soil_type37', 'soil_type38', 'soil_type39', 'soil_type40', 'cover_type']
# Quick structural overview of the loaded table.
print(df.shape)
print(df.dtypes)
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
df.info()
# Transposed so each feature is a row and each summary statistic a column.
print(df.describe().T)
(581012, 55)
elevation                             int64
aspect                                int64
slope                                 int64
horizontal_distance_to_hydrology      int64
vertical_distance_to_hydrology        int64
horizontal_distance_to_roadways       int64
hillshade_9am                         int64
hillshade_noon                        int64
hillshade_3pm                         int64
horizontal_distance_to_fire_points    int64
wilderness_area1                      int64
wilderness_area2                      int64
wilderness_area3                      int64
wilderness_area4                      int64
soil_type1                            int64
soil_type2                            int64
soil_type3                            int64
soil_type4                            int64
soil_type5                            int64
soil_type6                            int64
soil_type7                            int64
soil_type8                            int64
soil_type9                            int64
soil_type10                           int64
soil_type11                           int64
soil_type12                           int64
soil_type13                           int64
soil_type14                           int64
soil_type15                           int64
soil_type16                           int64
soil_type17                           int64
soil_type18                           int64
soil_type19                           int64
soil_type20                           int64
soil_type21                           int64
soil_type22                           int64
soil_type23                           int64
soil_type24                           int64
soil_type25                           int64
soil_type26                           int64
soil_type27                           int64
soil_type28                           int64
soil_type29                           int64
soil_type30                           int64
soil_type31                           int64
soil_type32                           int64
soil_type33                           int64
soil_type34                           int64
soil_type35                           int64
soil_type36                           int64
soil_type37                           int64
soil_type38                           int64
soil_type39                           int64
soil_type40                           int64
cover_type                            int64
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   elevation                           581012 non-null  int64
 1   aspect                              581012 non-null  int64
 2   slope                               581012 non-null  int64
 3   horizontal_distance_to_hydrology    581012 non-null  int64
 4   vertical_distance_to_hydrology      581012 non-null  int64
 5   horizontal_distance_to_roadways     581012 non-null  int64
 6   hillshade_9am                       581012 non-null  int64
 7   hillshade_noon                      581012 non-null  int64
 8   hillshade_3pm                       581012 non-null  int64
 9   horizontal_distance_to_fire_points  581012 non-null  int64
 10  wilderness_area1                    581012 non-null  int64
 11  wilderness_area2                    581012 non-null  int64
 12  wilderness_area3                    581012 non-null  int64
 13  wilderness_area4                    581012 non-null  int64
 14  soil_type1                          581012 non-null  int64
 15  soil_type2                          581012 non-null  int64
 16  soil_type3                          581012 non-null  int64
 17  soil_type4                          581012 non-null  int64
 18  soil_type5                          581012 non-null  int64
 19  soil_type6                          581012 non-null  int64
 20  soil_type7                          581012 non-null  int64
 21  soil_type8                          581012 non-null  int64
 22  soil_type9                          581012 non-null  int64
 23  soil_type10                         581012 non-null  int64
 24  soil_type11                         581012 non-null  int64
 25  soil_type12                         581012 non-null  int64
 26  soil_type13                         581012 non-null  int64
 27  soil_type14                         581012 non-null  int64
 28  soil_type15                         581012 non-null  int64
 29  soil_type16                         581012 non-null  int64
 30  soil_type17                         581012 non-null  int64
 31  soil_type18                         581012 non-null  int64
 32  soil_type19                         581012 non-null  int64
 33  soil_type20                         581012 non-null  int64
 34  soil_type21                         581012 non-null  int64
 35  soil_type22                         581012 non-null  int64
 36  soil_type23                         581012 non-null  int64
 37  soil_type24                         581012 non-null  int64
 38  soil_type25                         581012 non-null  int64
 39  soil_type26                         581012 non-null  int64
 40  soil_type27                         581012 non-null  int64
 41  soil_type28                         581012 non-null  int64
 42  soil_type29                         581012 non-null  int64
 43  soil_type30                         581012 non-null  int64
 44  soil_type31                         581012 non-null  int64
 45  soil_type32                         581012 non-null  int64
 46  soil_type33                         581012 non-null  int64
 47  soil_type34                         581012 non-null  int64
 48  soil_type35                         581012 non-null  int64
 49  soil_type36                         581012 non-null  int64
 50  soil_type37                         581012 non-null  int64
 51  soil_type38                         581012 non-null  int64
 52  soil_type39                         581012 non-null  int64
 53  soil_type40                         581012 non-null  int64
 54  cover_type                          581012 non-null  int64
dtypes: int64(55)
memory usage: 243.8 MB
None
                                       count         mean          std  \
elevation                           581012.0  2959.365301   279.984734   
aspect                              581012.0   155.656807   111.913721   
slope                               581012.0    14.103704     7.488242   
horizontal_distance_to_hydrology    581012.0   269.428217   212.549356   
vertical_distance_to_hydrology      581012.0    46.418855    58.295232   
horizontal_distance_to_roadways     581012.0  2350.146611  1559.254870   
hillshade_9am                       581012.0   212.146049    26.769889   
hillshade_noon                      581012.0   223.318716    19.768697   
hillshade_3pm                       581012.0   142.528263    38.274529   
horizontal_distance_to_fire_points  581012.0  1980.291226  1324.195210   
wilderness_area1                    581012.0     0.448865     0.497379   
wilderness_area2                    581012.0     0.051434     0.220882   
wilderness_area3                    581012.0     0.436074     0.495897   
wilderness_area4                    581012.0     0.063627     0.244087   
soil_type1                          581012.0     0.005217     0.072039   
soil_type2                          581012.0     0.012952     0.113066   
soil_type3                          581012.0     0.008301     0.090731   
soil_type4                          581012.0     0.021335     0.144499   
soil_type5                          581012.0     0.002749     0.052356   
soil_type6                          581012.0     0.011316     0.105775   
soil_type7                          581012.0     0.000181     0.013442   
soil_type8                          581012.0     0.000308     0.017550   
soil_type9                          581012.0     0.001974     0.044387   
soil_type10                         581012.0     0.056168     0.230245   
soil_type11                         581012.0     0.021359     0.144579   
soil_type12                         581012.0     0.051584     0.221186   
soil_type13                         581012.0     0.030001     0.170590   
soil_type14                         581012.0     0.001031     0.032092   
soil_type15                         581012.0     0.000005     0.002272   
soil_type16                         581012.0     0.004897     0.069804   
soil_type17                         581012.0     0.005890     0.076518   
soil_type18                         581012.0     0.003268     0.057077   
soil_type19                         581012.0     0.006921     0.082902   
soil_type20                         581012.0     0.015936     0.125228   
soil_type21                         581012.0     0.001442     0.037950   
soil_type22                         581012.0     0.057439     0.232681   
soil_type23                         581012.0     0.099399     0.299197   
soil_type24                         581012.0     0.036622     0.187833   
soil_type25                         581012.0     0.000816     0.028551   
soil_type26                         581012.0     0.004456     0.066605   
soil_type27                         581012.0     0.001869     0.043193   
soil_type28                         581012.0     0.001628     0.040318   
soil_type29                         581012.0     0.198356     0.398762   
soil_type30                         581012.0     0.051927     0.221879   
soil_type31                         581012.0     0.044175     0.205483   
soil_type32                         581012.0     0.090392     0.286743   
soil_type33                         581012.0     0.077716     0.267725   
soil_type34                         581012.0     0.002773     0.052584   
soil_type35                         581012.0     0.003255     0.056957   
soil_type36                         581012.0     0.000205     0.014310   
soil_type37                         581012.0     0.000513     0.022641   
soil_type38                         581012.0     0.026803     0.161508   
soil_type39                         581012.0     0.023762     0.152307   
soil_type40                         581012.0     0.015060     0.121791   
cover_type                          581012.0     2.051471     1.396504   

                                       min     25%     50%     75%     max  
elevation                           1859.0  2809.0  2996.0  3163.0  3858.0  
aspect                                 0.0    58.0   127.0   260.0   360.0  
slope                                  0.0     9.0    13.0    18.0    66.0  
horizontal_distance_to_hydrology       0.0   108.0   218.0   384.0  1397.0  
vertical_distance_to_hydrology      -173.0     7.0    30.0    69.0   601.0  
horizontal_distance_to_roadways        0.0  1106.0  1997.0  3328.0  7117.0  
hillshade_9am                          0.0   198.0   218.0   231.0   254.0  
hillshade_noon                         0.0   213.0   226.0   237.0   254.0  
hillshade_3pm                          0.0   119.0   143.0   168.0   254.0  
horizontal_distance_to_fire_points     0.0  1024.0  1710.0  2550.0  7173.0  
wilderness_area1                       0.0     0.0     0.0     1.0     1.0  
wilderness_area2                       0.0     0.0     0.0     0.0     1.0  
wilderness_area3                       0.0     0.0     0.0     1.0     1.0  
wilderness_area4                       0.0     0.0     0.0     0.0     1.0  
soil_type1                             0.0     0.0     0.0     0.0     1.0  
soil_type2                             0.0     0.0     0.0     0.0     1.0  
soil_type3                             0.0     0.0     0.0     0.0     1.0  
soil_type4                             0.0     0.0     0.0     0.0     1.0  
soil_type5                             0.0     0.0     0.0     0.0     1.0  
soil_type6                             0.0     0.0     0.0     0.0     1.0  
soil_type7                             0.0     0.0     0.0     0.0     1.0  
soil_type8                             0.0     0.0     0.0     0.0     1.0  
soil_type9                             0.0     0.0     0.0     0.0     1.0  
soil_type10                            0.0     0.0     0.0     0.0     1.0  
soil_type11                            0.0     0.0     0.0     0.0     1.0  
soil_type12                            0.0     0.0     0.0     0.0     1.0  
soil_type13                            0.0     0.0     0.0     0.0     1.0  
soil_type14                            0.0     0.0     0.0     0.0     1.0  
soil_type15                            0.0     0.0     0.0     0.0     1.0  
soil_type16                            0.0     0.0     0.0     0.0     1.0  
soil_type17                            0.0     0.0     0.0     0.0     1.0  
soil_type18                            0.0     0.0     0.0     0.0     1.0  
soil_type19                            0.0     0.0     0.0     0.0     1.0  
soil_type20                            0.0     0.0     0.0     0.0     1.0  
soil_type21                            0.0     0.0     0.0     0.0     1.0  
soil_type22                            0.0     0.0     0.0     0.0     1.0  
soil_type23                            0.0     0.0     0.0     0.0     1.0  
soil_type24                            0.0     0.0     0.0     0.0     1.0  
soil_type25                            0.0     0.0     0.0     0.0     1.0  
soil_type26                            0.0     0.0     0.0     0.0     1.0  
soil_type27                            0.0     0.0     0.0     0.0     1.0  
soil_type28                            0.0     0.0     0.0     0.0     1.0  
soil_type29                            0.0     0.0     0.0     0.0     1.0  
soil_type30                            0.0     0.0     0.0     0.0     1.0  
soil_type31                            0.0     0.0     0.0     0.0     1.0  
soil_type32                            0.0     0.0     0.0     0.0     1.0  
soil_type33                            0.0     0.0     0.0     0.0     1.0  
soil_type34                            0.0     0.0     0.0     0.0     1.0  
soil_type35                            0.0     0.0     0.0     0.0     1.0  
soil_type36                            0.0     0.0     0.0     0.0     1.0  
soil_type37                            0.0     0.0     0.0     0.0     1.0  
soil_type38                            0.0     0.0     0.0     0.0     1.0  
soil_type39                            0.0     0.0     0.0     0.0     1.0  
soil_type40                            0.0     0.0     0.0     0.0     1.0  
cover_type                             1.0     1.0     2.0     2.0     7.0  
# Build the null mask once and derive both per-column summaries from it.
null_mask = df.isna()
missing_counts = null_mask.sum()
missing_percent = null_mask.mean().mul(100).round(2)
del null_mask

# One row per column; columns with the most missing values come first.
missing_table = pd.DataFrame(
    {"missing_count": missing_counts, "missing_percent": missing_percent}
)
missing_table = missing_table.sort_values("missing_count", ascending=False)

print(missing_table.head(20))

# Grand total across all columns.
total_missing = missing_counts.sum()
print("\nTotal missing values:", total_missing)
             missing_count  missing_percent
elevation                0              0.0
soil_type28              0              0.0
soil_type17              0              0.0
soil_type18              0              0.0
soil_type19              0              0.0
soil_type20              0              0.0
soil_type21              0              0.0
soil_type22              0              0.0
soil_type23              0              0.0
soil_type24              0              0.0
soil_type25              0              0.0
soil_type26              0              0.0
soil_type27              0              0.0
soil_type29              0              0.0
soil_type15              0              0.0
soil_type30              0              0.0
soil_type31              0              0.0
soil_type32              0              0.0
soil_type33              0              0.0
soil_type34              0              0.0

Total missing values: 0
# Flag rows that exactly repeat an earlier row, then count the flagged rows.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print("Duplicate rows:", duplicate_count)
Duplicate rows: 0
target_col = "cover_type"

# Absolute number of samples per class, ordered by class label.
class_counts = df[target_col].value_counts().sort_index()
print("\nTarget value counts:")
print(class_counts)

# The same distribution expressed as percentages, rounded to two decimals.
class_share = df[target_col].value_counts(normalize=True).sort_index()
print("\nTarget proportions:")
print((class_share * 100).round(2))
Target value counts:
cover_type
1    211840
2    283301
3     35754
4      2747
5      9493
6     17367
7     20510
Name: count, dtype: int64

Target proportions:
cover_type
1    36.46
2    48.76
3     6.15
4     0.47
5     1.63
6     2.99
7     3.53
Name: proportion, dtype: float64
# XGBoost expects class labels from 0 to number_of_classes - 1.
# Original Cover_Type labels are 1 to 7.
# The guard keeps this cell safe to re-run: once the labels start at 0 the
# shift has already been applied and must not be applied a second time.
if df[target_col].min() >= 1:
    df[target_col] = df[target_col] - 1

print("\nTarget values after shifting from 1-7 to 0-6:")
print(df[target_col].value_counts().sort_index())
Target values after shifting from 1-7 to 0-6:
cover_type
0    211840
1    283301
2     35754
3      2747
4      9493
5     17367
6     20510
Name: count, dtype: int64
# Every column except the target is a model input.
feature_cols = [name for name in df.columns if name != target_col]

# Wilderness and soil columns are already one-hot encoded; pick them by prefix.
wilderness_cols = [name for name in feature_cols if name.startswith("wilderness_area")]
soil_cols = [name for name in feature_cols if name.startswith("soil_type")]
binary_encoded_cols = [*wilderness_cols, *soil_cols]

# Whatever is left over is treated as a continuous numeric feature.
indicator_names = set(binary_encoded_cols)
continuous_cols = [name for name in feature_cols if name not in indicator_names]
# Bar chart of how many samples fall into each (0-based) cover type class.
class_distribution = df[target_col].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 5))
class_distribution.plot(kind="bar", ax=ax)
ax.set_title("Cover Type Class Distribution")
ax.set_xlabel("Cover Type Class (0-6)")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()
../_images/80ff29ce69af09ea2f41efa8f6d52c5dc13885b3e3d8b095bc8130f3bbc633db.png
# Separate the label column from the model inputs.
y = df[target_col]
X = df.drop(columns=target_col)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
X shape: (581012, 54)
y shape: (581012,)
# Stage 1: hold out 20% of all rows as the test set; the remaining 80% is
# the combined train+validation pool. Stratified so class shares are kept.
outer_split = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_train_val, X_test, y_train_val, y_test = outer_split

# Stage 2: carve 25% of the pool out as the validation set (75% stays as
# training data), again stratified on the label.
inner_split = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)
X_train, X_val, y_train, y_val = inner_split

# Report the feature/label shapes of each partition.
for split_label, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_label} shape:", split_X.shape, split_y.shape)
Train shape: (348606, 54) (348606,)
Validation shape: (116203, 54) (116203,)
Test shape: (116203, 54) (116203,)
# Standardise the continuous features and forward the 0/1 indicator columns
# untouched; any column not listed in either group is dropped.
column_transformers = [
    ("scale_continuous", StandardScaler(), continuous_cols),
    ("pass_binary_encoded", "passthrough", binary_encoded_cols),
]
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder="drop",
)
print(preprocessor)
ColumnTransformer(transformers=[('scale_continuous', StandardScaler(),
                                 ['elevation', 'aspect', 'slope',
                                  'horizontal_distance_to_hydrology',
                                  'vertical_distance_to_hydrology',
                                  'horizontal_distance_to_roadways',
                                  'hillshade_9am', 'hillshade_noon',
                                  'hillshade_3pm',
                                  'horizontal_distance_to_fire_points']),
                                ('pass_binary_encoded', 'passthrough',
                                 ['wilderness_area1', 'wil...
                                  'soil_type1', 'soil_type2', 'soil_type3',
                                  'soil_type4', 'soil_type5', 'soil_type6',
                                  'soil_type7', 'soil_type8', 'soil_type9',
                                  'soil_type10', 'soil_type11', 'soil_type12',
                                  'soil_type13', 'soil_type14', 'soil_type15',
                                  'soil_type16', 'soil_type17', 'soil_type18',
                                  'soil_type19', 'soil_type20', 'soil_type21',
                                  'soil_type22', 'soil_type23', 'soil_type24',
                                  'soil_type25', 'soil_type26', ...])])
# Baseline classifier: 7-class soft-probability objective with otherwise
# default hyperparameters, histogram tree builder, fixed seed.
baseline_xgb_settings = {
    "objective": "multi:softprob",
    "num_class": 7,
    "eval_metric": "mlogloss",
    "tree_method": "hist",
    "random_state": 42,
    "n_jobs": -1,
}
xgb_baseline = XGBClassifier(**baseline_xgb_settings)

# Preprocessing and model chained so both are fitted together.
baseline_steps = [
    ("preprocessor", preprocessor),
    ("model", xgb_baseline),
]
baseline_pipeline = Pipeline(steps=baseline_steps)
print("Training Baseline XGBoost Model")

# Fit on the training split only, then score on the validation split.
baseline_pipeline.fit(X_train, y_train)
val_pred_baseline = baseline_pipeline.predict(X_val)

baseline_accuracy = accuracy_score(y_val, val_pred_baseline)

# Macro averaging weights all seven classes equally, regardless of support.
baseline_f1_macro, baseline_precision_macro, baseline_recall_macro = (
    macro_metric(y_val, val_pred_baseline, average="macro")
    for macro_metric in (f1_score, precision_score, recall_score)
)
Training Baseline XGBoost Model
# Headline validation metrics of the baseline model, one per line.
print("\nBaseline Validation Metrics:")
baseline_summary = (
    ("Accuracy", baseline_accuracy),
    ("F1 Macro", baseline_f1_macro),
    ("Precision Macro", baseline_precision_macro),
    ("Recall Macro", baseline_recall_macro),
)
for metric_label, metric_value in baseline_summary:
    print(f"{metric_label}:", metric_value)

# Per-class precision / recall / F1 on the validation split.
print("\nBaseline Classification Report:")
baseline_report = classification_report(y_val, val_pred_baseline)
print(baseline_report)
Baseline Validation Metrics:
Accuracy: 0.8688243849126098
F1 Macro: 0.8519775932545374
Precision Macro: 0.8843506886108257
Recall Macro: 0.8302273974629332

Baseline Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.84      0.85     42368
           1       0.86      0.90      0.88     56660
           2       0.90      0.91      0.90      7151
           3       0.90      0.88      0.89       549
           4       0.87      0.55      0.67      1899
           5       0.85      0.82      0.84      3474
           6       0.95      0.91      0.93      4102

    accuracy                           0.87    116203
   macro avg       0.88      0.83      0.85    116203
weighted avg       0.87      0.87      0.87    116203
# Fresh estimator for the hyperparameter search; the fixed settings match the
# baseline, and the searched hyperparameters are overridden per candidate.
tuning_xgb_settings = {
    "objective": "multi:softprob",
    "num_class": 7,
    "eval_metric": "mlogloss",
    "tree_method": "hist",
    "random_state": 42,
    "n_jobs": -1,
}
xgb_for_tuning = XGBClassifier(**tuning_xgb_settings)

# The step name "model" is what the "model__" prefix in the search space refers to.
tuning_steps = [
    ("preprocessor", preprocessor),
    ("model", xgb_for_tuning),
]
tuning_pipeline = Pipeline(steps=tuning_steps)

# Candidate values per XGBoost hyperparameter, keyed by the bare parameter name.
xgb_search_space = {
    "n_estimators": [200, 300, 500, 700],
    "max_depth": [4, 6, 8, 10],
    "learning_rate": [0.03, 0.05, 0.08, 0.1],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0, 0.1, 0.3, 0.5],
    "reg_alpha": [0, 0.01, 0.1],
    "reg_lambda": [1, 1.5, 2, 3],
}

# Prefix each name with the pipeline step ("model") so the search can route
# the value to the classifier inside the pipeline.
param_distributions = {
    f"model__{param_name}": candidates
    for param_name, candidates in xgb_search_space.items()
}
# Randomised search: 20 sampled candidates x 3 CV folds, ranked by macro F1.
#
# n_jobs is 1 here on purpose. The XGBoost estimator inside tuning_pipeline is
# created with n_jobs=-1, so every single fit already uses all cores. Running
# the search itself with n_jobs=-1 as well would start one worker per core,
# each spawning a full set of XGBoost threads, and the resulting thread
# contention slows the search down instead of speeding it up.
random_search = RandomizedSearchCV(
    estimator=tuning_pipeline,
    param_distributions=param_distributions,
    n_iter=20,
    scoring="f1_macro",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=1
)
print("Starting Hyperparameter Tuning")
print("=" * 80)

# Cross-validated search on the training split only; validation and test
# data are not passed to the search.
random_search.fit(X_train, y_train)

# Mean cross-validated score of the best candidate, and its parameter set.
print("\nBest CV Score:", random_search.best_score_, sep="\n")
print("\nBest Parameters:", random_search.best_params_, sep="\n")
Starting Hyperparameter Tuning
================================================================================
Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best CV Score:
0.9208403188731179

Best Parameters:
{'model__subsample': 0.9, 'model__reg_lambda': 1.5, 'model__reg_alpha': 0.01, 'model__n_estimators': 700, 'model__min_child_weight': 3, 'model__max_depth': 10, 'model__learning_rate': 0.05, 'model__gamma': 0, 'model__colsample_bytree': 0.8}
# Best pipeline found by the search, scored on the validation split.
best_model = random_search.best_estimator_
val_pred_tuned = best_model.predict(X_val)

tuned_accuracy = accuracy_score(y_val, val_pred_tuned)

# Macro averaging weights all seven classes equally, regardless of support.
tuned_f1_macro, tuned_precision_macro, tuned_recall_macro = (
    macro_metric(y_val, val_pred_tuned, average="macro")
    for macro_metric in (f1_score, precision_score, recall_score)
)
# Headline validation metrics of the tuned model, one per line.
print("Tuned Model Validation Metrics")
tuned_summary = (
    ("Accuracy", tuned_accuracy),
    ("F1 Macro", tuned_f1_macro),
    ("Precision Macro", tuned_precision_macro),
    ("Recall Macro", tuned_recall_macro),
)
for metric_label, metric_value in tuned_summary:
    print(f"{metric_label}:", metric_value)

# Per-class precision / recall / F1 on the validation split.
print("\nTuned Validation Classification Report:")
tuned_report = classification_report(y_val, val_pred_tuned)
print(tuned_report)
Tuned Model Validation Metrics
Accuracy: 0.9509393044929993
F1 Macro: 0.9324396508506769
Precision Macro: 0.939969531757413
Recall Macro: 0.9253948395025402

Tuned Validation Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.95     42368
           1       0.95      0.97      0.96     56660
           2       0.95      0.96      0.96      7151
           3       0.91      0.89      0.90       549
           4       0.91      0.84      0.87      1899
           5       0.93      0.92      0.93      3474
           6       0.97      0.96      0.97      4102

    accuracy                           0.95    116203
   macro avg       0.94      0.93      0.93    116203
weighted avg       0.95      0.95      0.95    116203
# One record per model so the validation metrics can be read side by side.
comparison_rows = [
    {
        "model": "Baseline XGBoost",
        "accuracy": baseline_accuracy,
        "f1_macro": baseline_f1_macro,
        "precision_macro": baseline_precision_macro,
        "recall_macro": baseline_recall_macro,
    },
    {
        "model": "Tuned XGBoost",
        "accuracy": tuned_accuracy,
        "f1_macro": tuned_f1_macro,
        "precision_macro": tuned_precision_macro,
        "recall_macro": tuned_recall_macro,
    },
]
comparison_df = pd.DataFrame(comparison_rows)

print("Baseline vs Tuned Model Comparison")
print(comparison_df.round(4))
Baseline vs Tuned Model Comparison
              model  accuracy  f1_macro  precision_macro  recall_macro
0  Baseline XGBoost    0.8688    0.8520           0.8844        0.8302
1     Tuned XGBoost    0.9509    0.9324           0.9400        0.9254
best_params = random_search.best_params_

# Remove "model__" prefix because now we pass parameters directly to XGBClassifier.
# Only a leading prefix is stripped, so a parameter name that merely contains
# the text elsewhere is left untouched.
step_prefix = "model__"
clean_best_params = {
    (key[len(step_prefix):] if key.startswith(step_prefix) else key): value
    for key, value in best_params.items()
}

# Same fixed settings as the earlier estimators, plus the tuned hyperparameters.
final_xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=7,
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    **clean_best_params
)

# Pipeline.fit() fits its steps in place. The module-level `preprocessor`
# object is also the one sitting inside baseline_pipeline, so the final model
# gets its own unfitted copy; otherwise fitting here would silently refit the
# baseline's scaler on different data.
final_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", final_xgb)
    ]
)
print("Training Final Model on Train + Validation")

# Train on train + validation combined; the test split stays untouched.
final_pipeline.fit(X_train_val, y_train_val)
Training Final Model on Train + Validation
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('scale_continuous',
                                                  StandardScaler(),
                                                  ['elevation', 'aspect',
                                                   'slope',
                                                   'horizontal_distance_to_hydrology',
                                                   'vertical_distance_to_hydrology',
                                                   'horizontal_distance_to_roadways',
                                                   'hillshade_9am',
                                                   'hillshade_noon',
                                                   'hillshade_3pm',
                                                   'horizontal_distance_to_fire_points']),
                                                 ('pass_binary_encoded',
                                                  'passt...
                               feature_types=None, feature_weights=None,
                               gamma=0, grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=0.05,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=10, max_leaves=None,
                               min_child_weight=3, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
                               n_estimators=700, n_jobs=-1, num_class=7, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Score the final pipeline on the held-out test split.
test_pred = final_pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, test_pred)

# Macro averaging weights all seven classes equally, regardless of support.
test_f1_macro, test_precision_macro, test_recall_macro = (
    macro_metric(y_test, test_pred, average="macro")
    for macro_metric in (f1_score, precision_score, recall_score)
)
# Headline test-set metrics of the final model, one per line.
print("Final Test Set Metrics")
print("=" * 80)

test_summary = (
    ("Accuracy", test_accuracy),
    ("F1 Macro", test_f1_macro),
    ("Precision Macro", test_precision_macro),
    ("Recall Macro", test_recall_macro),
)
for metric_label, metric_value in test_summary:
    print(f"{metric_label}:", metric_value)

# Per-class precision / recall / F1 on the test split.
print("\nFinal Test Classification Report:")
test_report = classification_report(y_test, test_pred)
print(test_report)
Final Test Set Metrics
================================================================================
Accuracy: 0.9549839505004174
F1 Macro: 0.9370123739255286
Precision Macro: 0.9474308399788806
Recall Macro: 0.9273853036142882

Final Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     42368
           1       0.95      0.97      0.96     56661
           2       0.96      0.96      0.96      7151
           3       0.91      0.87      0.89       549
           4       0.94      0.86      0.90      1899
           5       0.93      0.92      0.93      3473
           6       0.97      0.97      0.97      4102

    accuracy                           0.95    116203
   macro avg       0.95      0.93      0.94    116203
weighted avg       0.96      0.95      0.95    116203
# Confusion matrix of the final model on the test split
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, test_pred)

# ConfusionMatrixDisplay.plot() opens its own figure unless it is given an
# Axes. Creating the 8x6 figure first and passing its Axes keeps the requested
# size and avoids leaving an empty extra figure behind.
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax, values_format="d")
ax.set_title("Final XGBoost Confusion Matrix")
fig.tight_layout()
plt.show()
<Figure size 800x600 with 0 Axes>
../_images/741b54a58222bda9c5fd0982a769bf299eb1d40d9fe4b8431dbb3662b157773e.png
# Column order of the matrix the model was trained on: the preprocessor lists
# the scaled continuous block first and the passthrough indicator block second.
feature_names = [*continuous_cols, *binary_encoded_cols]

# Fitted classifier taken from the final pipeline.
xgb_model = final_pipeline.named_steps["model"]

# Pair each input column with its importance score, highest first.
importance_df = pd.DataFrame(
    {"feature": feature_names, "importance": xgb_model.feature_importances_}
)
importance_df = importance_df.sort_values("importance", ascending=False)
# importance_df is sorted in descending order, so its head is the top 20.
top_by_importance = importance_df.head(20)

print("=" * 80)
print("Top 20 Feature Importances")
print(top_by_importance)

# Re-sort ascending so the most important feature ends up at the top of the
# horizontal bar chart.
top_20 = top_by_importance.sort_values("importance")

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_20["feature"], top_20["importance"])
ax.set_title("Top 20 XGBoost Feature Importances")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()
================================================================================
Top 20 Feature Importances
             feature  importance
13  wilderness_area4    0.150222
17        soil_type4    0.055444
52       soil_type39    0.054769
35       soil_type22    0.048671
15        soil_type2    0.040670
0          elevation    0.035064
25       soil_type12    0.034678
51       soil_type38    0.034195
50       soil_type37    0.031803
10  wilderness_area1    0.029741
53       soil_type40    0.029696
48       soil_type35    0.027145
16        soil_type3    0.023744
45       soil_type32    0.022238
34       soil_type21    0.021833
11  wilderness_area2    0.021565
12  wilderness_area3    0.020666
44       soil_type31    0.018670
40       soil_type27    0.017045
36       soil_type23    0.016121
../_images/8ea370ec5cccb499e8b748142cda66185f1547d6e186042ed6f2dde746ca493d.png