Ocean Temperature Prediction Using CalCOFI Environmental Data
CalCOFI: Over 60 years of oceanographic data
import pandas as pd
import numpy as np
# Load the two raw CalCOFI tables. low_memory=False makes pandas parse each
# file in a single pass so that column dtypes are inferred consistently.
bottle = pd.read_csv("../data/bottle.csv", low_memory=False)
cast = pd.read_csv("../data/cast.csv", low_memory=False)

# Columns taken from the cast table; "Cst_Cnt" is the join key shared with
# the bottle table, the rest become extra columns on every bottle row.
cast_cols = [
    "Cst_Cnt",
    "Year", "Month", "Quarter",
    "Lat_Dec", "Lon_Dec",
    "Bottom_D", "Distance",
]

# Left join: every bottle row is kept even when no cast record matches.
df = pd.merge(bottle, cast[cast_cols], how="left", on="Cst_Cnt")
df.shape
(864863, 81)
# Keep only the rows that actually have a temperature value (the target).
df = df[df["T_degC"].notna()].copy()

# Raw column name -> descriptive name used throughout the rest of the analysis.
rename_map = {
    # target
    "T_degC": "target_temperature_celsius",
    # bottle measurements
    "Depthm": "depth_meters",
    "Salnty": "salinity",
    "O2ml_L": "oxygen_ml_per_liter",
    "PO4uM": "phosphate_umol",
    "SiO3uM": "silicate_umol",
    "NO2uM": "nitrite_umol",
    "NO3uM": "nitrate_umol",
    # columns merged in from the cast table
    "Lat_Dec": "latitude",
    "Lon_Dec": "longitude",
    "Bottom_D": "bottom_depth",
    "Distance": "distance_from_coast",
    "Year": "year",
    "Month": "month",
    "Quarter": "quarter",
}
df = df.rename(rename_map, axis="columns")

target = "target_temperature_celsius"

# Model inputs, in the column order handed to every estimator.
features = [
    "depth_meters",
    "salinity", "oxygen_ml_per_liter",
    "phosphate_umol", "silicate_umol", "nitrite_umol", "nitrate_umol",
    "year", "month",
    "latitude", "longitude",
    "bottom_depth", "distance_from_coast",
]

# Temporal hold-out: years up to 2014 are used for training/cross-validation,
# years from 2015 onward form the test set. Each part is put in a fixed
# chronological order and re-indexed from zero.
train_cv_df = (
    df[df["year"] <= 2014]
    .copy()
    .sort_values(["year", "month", "Cst_Cnt", "depth_meters"])
    .reset_index(drop=True)
)
test_df = (
    df[df["year"] >= 2015]
    .copy()
    .sort_values(["year", "month", "Cst_Cnt", "depth_meters"])
    .reset_index(drop=True)
)

X_train_cv, y_train_cv = train_cv_df[features], train_cv_df[target]
X_test, y_test = test_df[features], test_df[target]
from sklearn.model_selection import TimeSeriesSplit
# Cross-validation is done over distinct calendar years rather than rows, so
# every validation fold covers years strictly after its training years.
years = np.sort(pd.unique(train_cv_df["year"]))
tscv = TimeSeriesSplit(n_splits=5)

# Preview which year ranges each fold trains and validates on.
for fold, (train_year_idx, val_year_idx) in enumerate(tscv.split(years), start=1):
    train_years, val_years = years[train_year_idx], years[val_year_idx]
    train_span = f"{train_years[0]}–{train_years[-1]}"
    val_span = f"{val_years[0]}–{val_years[-1]}"
    print(f"Fold {fold}: Train {train_span} | Validate {val_span}")
Fold 1: Train 1949–1963 | Validate 1964–1973
Fold 2: Train 1949–1973 | Validate 1974–1984
Fold 3: Train 1949–1984 | Validate 1985–1994
Fold 4: Train 1949–1994 | Validate 1995–2004
Fold 5: Train 1949–2004 | Validate 2005–2014
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
# Baseline linear model: median imputation (plus missing-value indicator
# columns), standardisation, then Ridge regression.
ridge_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=10.0)),
    ]
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def regression_metrics(y_true, y_pred):
    """Return ``(MAE, RMSE, R2)`` for a set of predictions.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        Mean absolute error, root mean squared error (square root of the
        mean squared error) and the R² score, in that order.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
    )
# Walk-forward evaluation of the baseline Ridge pipeline: for every fold the
# pipeline is refit on the training years and scored on the validation years.
cv_results = []
for fold, (train_year_idx, val_year_idx) in enumerate(tscv.split(years), start=1):
    train_years = years[train_year_idx]
    val_years = years[val_year_idx]

    # Select rows by calendar year.
    fold_train = train_cv_df[train_cv_df["year"].isin(train_years)]
    fold_val = train_cv_df[train_cv_df["year"].isin(val_years)]

    ridge_pipeline.fit(fold_train[features], fold_train[target])
    val_pred = ridge_pipeline.predict(fold_val[features])
    mae, rmse, r2 = regression_metrics(fold_val[target], val_pred)

    cv_results.append(
        dict(
            fold=fold,
            train_years=f"{train_years[0]}–{train_years[-1]}",
            validation_years=f"{val_years[0]}–{val_years[-1]}",
            MAE=mae,
            RMSE=rmse,
            R2=r2,
        )
    )

# One row per fold.
cv_results_df = pd.DataFrame(cv_results)
cv_results_df
| | fold | train_years | validation_years | MAE | RMSE | R2 |
|---|---|---|---|---|---|---|
| 0 | 1 | 1949–1963 | 1964–1973 | 2.502009 | 3.128031 | 0.483301 |
| 1 | 2 | 1949–1973 | 1974–1984 | 1.815000 | 2.334075 | 0.669461 |
| 2 | 3 | 1949–1984 | 1985–1994 | 1.289910 | 1.696874 | 0.801674 |
| 3 | 4 | 1949–1994 | 1995–2004 | 1.126286 | 1.518651 | 0.833472 |
| 4 | 5 | 1949–2004 | 2005–2014 | 1.096580 | 1.470248 | 0.831887 |
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
def regression_metrics(y_true, y_pred):
    """Compute MAE, RMSE and R² for predictions against observed values.

    Returns the three scores as a tuple ``(mae, rmse, r2)``; RMSE is the
    square root of scikit-learn's mean squared error.
    """
    scores = (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )
    return scores
def evaluate_model_timeseries(model, model_name, scale=True):
    """Evaluate an estimator with the year-based walk-forward splits.

    For every fold produced by the module-level ``tscv`` over ``years``, a
    fresh pipeline (median imputation with missing-value indicators,
    optional standardisation, then the estimator) is fit on the training
    years of ``train_cv_df`` and scored on both the training and the
    validation years.

    Parameters
    ----------
    model : scikit-learn regressor
        Estimator template. It is cloned for every fold, so the object
        passed in is never fitted or otherwise modified by this function.
    model_name : str
        Label stored in the ``model`` column of the result.
    scale : bool, default True
        If true, a ``StandardScaler`` step is inserted between the imputer
        and the estimator.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the fold number, the train/validation year
        ranges and train/validation MAE, RMSE and R².

    Notes
    -----
    Relies on the module-level names ``tscv``, ``years``, ``train_cv_df``,
    ``features``, ``target`` and ``regression_metrics``.
    """
    # Local import so the function does not depend on a file-level import.
    from sklearn.base import clone

    results = []

    for fold, (train_year_idx, val_year_idx) in enumerate(tscv.split(years), start=1):
        train_years = years[train_year_idx]
        val_years = years[val_year_idx]

        # Rows are assigned to a fold by calendar year, not by position.
        train_mask = train_cv_df["year"].isin(train_years)
        val_mask = train_cv_df["year"].isin(val_years)

        X_train_fold = train_cv_df.loc[train_mask, features]
        y_train_fold = train_cv_df.loc[train_mask, target]
        X_val_fold = train_cv_df.loc[val_mask, features]
        y_val_fold = train_cv_df.loc[val_mask, target]

        # Build the pipeline once, adding the scaler only when requested.
        steps = [("imputer", SimpleImputer(strategy="median", add_indicator=True))]
        if scale:
            steps.append(("scaler", StandardScaler()))
        # clone() yields an unfitted copy with the same hyper-parameters, so
        # folds cannot share fitted state and the caller's estimator object
        # is left untouched.
        steps.append(("model", clone(model)))
        pipe = Pipeline(steps)

        pipe.fit(X_train_fold, y_train_fold)

        train_pred = pipe.predict(X_train_fold)
        val_pred = pipe.predict(X_val_fold)

        train_mae, train_rmse, train_r2 = regression_metrics(y_train_fold, train_pred)
        val_mae, val_rmse, val_r2 = regression_metrics(y_val_fold, val_pred)

        results.append({
            "model": model_name,
            "fold": fold,
            "train_years": f"{train_years[0]}–{train_years[-1]}",
            "validation_years": f"{val_years[0]}–{val_years[-1]}",
            "train_MAE": train_mae,
            "train_RMSE": train_rmse,
            "train_R2": train_r2,
            "val_MAE": val_mae,
            "val_RMSE": val_rmse,
            "val_R2": val_r2,
        })

    return pd.DataFrame(results)
from sklearn.linear_model import Ridge
# Regularisation strengths to compare.
ridge_alphas = [0.01, 0.1, 1, 10, 50, 100, 500, 1000]

# One per-fold result frame for every alpha.
all_ridge_results = [
    evaluate_model_timeseries(
        model=Ridge(alpha=alpha),
        model_name=f"Ridge_alpha_{alpha}",
        scale=True,
    )
    for alpha in ridge_alphas
]
ridge_tuning_df = pd.concat(all_ridge_results, ignore_index=True)

# Named aggregations: mean of each train/validation metric across folds.
ridge_aggregations = {
    f"mean_{split}_{metric}": (f"{split}_{metric}", "mean")
    for metric in ("MAE", "RMSE", "R2")
    for split in ("train", "val")
}
ridge_by_model = ridge_tuning_df.groupby("model").agg(**ridge_aggregations)

# Best (lowest) mean validation MAE first.
ridge_summary = ridge_by_model.sort_values("mean_val_MAE")
ridge_summary
| model | mean_train_MAE | mean_val_MAE | mean_train_RMSE | mean_val_RMSE | mean_train_R2 | mean_val_R2 |
|---|---|---|---|---|---|---|
| Ridge_alpha_0.01 | 1.682884 | 1.565956 | 2.262360 | 2.029577 | 0.741926 | 0.723958 |
| Ridge_alpha_0.1 | 1.682884 | 1.565956 | 2.262360 | 2.029577 | 0.741926 | 0.723958 |
| Ridge_alpha_1 | 1.682883 | 1.565956 | 2.262360 | 2.029577 | 0.741926 | 0.723958 |
| Ridge_alpha_10 | 1.682882 | 1.565957 | 2.262360 | 2.029576 | 0.741926 | 0.723959 |
| Ridge_alpha_50 | 1.682877 | 1.565962 | 2.262361 | 2.029572 | 0.741926 | 0.723962 |
| Ridge_alpha_100 | 1.682870 | 1.565969 | 2.262361 | 2.029568 | 0.741926 | 0.723966 |
| Ridge_alpha_500 | 1.682817 | 1.566023 | 2.262366 | 2.029537 | 0.741924 | 0.723998 |
| Ridge_alpha_1000 | 1.682758 | 1.566095 | 2.262382 | 2.029504 | 0.741921 | 0.724035 |
from sklearn.tree import DecisionTreeRegressor
# Single decision trees at increasing maximum depth.
tree_models = [
    (
        f"DecisionTree_depth_{depth}",
        DecisionTreeRegressor(max_depth=depth, random_state=42),
    )
    for depth in (5, 10, 15, 20)
]

# Trees are evaluated without the scaling step.
tree_results = [
    evaluate_model_timeseries(model=estimator, model_name=label, scale=False)
    for label, estimator in tree_models
]
tree_results_df = pd.concat(tree_results, ignore_index=True)

# Mean of every train/validation metric across folds, per model.
tree_aggregations = {
    f"mean_{split}_{metric}": (f"{split}_{metric}", "mean")
    for metric in ("MAE", "RMSE", "R2")
    for split in ("train", "val")
}
tree_by_model = tree_results_df.groupby("model").agg(**tree_aggregations)

# Lowest mean validation MAE first.
tree_summary = tree_by_model.sort_values("mean_val_MAE")
tree_summary
| model | mean_train_MAE | mean_val_MAE | mean_train_RMSE | mean_val_RMSE | mean_train_R2 | mean_val_R2 |
|---|---|---|---|---|---|---|
| DecisionTree_depth_15 | 0.365398 | 0.599433 | 0.568175 | 0.979580 | 0.983640 | 0.933012 |
| DecisionTree_depth_20 | 0.155387 | 0.620787 | 0.285406 | 1.017554 | 0.995810 | 0.927901 |
| DecisionTree_depth_10 | 0.586091 | 0.654021 | 0.887020 | 1.009999 | 0.960305 | 0.930502 |
| DecisionTree_depth_5 | 0.961570 | 0.858932 | 1.409718 | 1.243463 | 0.899870 | 0.898872 |
from sklearn.ensemble import HistGradientBoostingRegressor
# Histogram gradient boosting candidates: every combination of two learning
# rates and two maximum depths; all other settings are held fixed.
hgb_models = [
    (
        f"HGB_lr_{learning_rate}_depth_{depth}",
        HistGradientBoostingRegressor(
            learning_rate=learning_rate,
            max_iter=200,
            max_leaf_nodes=31,
            max_depth=depth,
            l2_regularization=0.0,
            random_state=42,
        ),
    )
    for learning_rate, depth in ((0.05, 6), (0.05, 10), (0.1, 6), (0.1, 10))
]

# Evaluated without the scaling step.
hgb_results = [
    evaluate_model_timeseries(model=estimator, model_name=label, scale=False)
    for label, estimator in hgb_models
]
hgb_results_df = pd.concat(hgb_results, ignore_index=True)

# Mean of every train/validation metric across folds, per model.
hgb_aggregations = {
    f"mean_{split}_{metric}": (f"{split}_{metric}", "mean")
    for metric in ("MAE", "RMSE", "R2")
    for split in ("train", "val")
}
hgb_by_model = hgb_results_df.groupby("model").agg(**hgb_aggregations)

# Lowest mean validation MAE first.
hgb_summary = hgb_by_model.sort_values("mean_val_MAE")
hgb_summary
| model | mean_train_MAE | mean_val_MAE | mean_train_RMSE | mean_val_RMSE | mean_train_R2 | mean_val_R2 |
|---|---|---|---|---|---|---|
| HGB_lr_0.1_depth_6 | 0.440810 | 0.607332 | 0.653321 | 0.819093 | 0.978479 | 0.951920 |
| HGB_lr_0.05_depth_6 | 0.480591 | 0.609946 | 0.710915 | 0.823207 | 0.974516 | 0.953166 |
| HGB_lr_0.05_depth_10 | 0.470636 | 0.633857 | 0.692547 | 0.847645 | 0.975819 | 0.949125 |
| HGB_lr_0.1_depth_10 | 0.432193 | 0.642879 | 0.635330 | 0.855562 | 0.979645 | 0.945839 |
from sklearn.ensemble import RandomForestRegressor
# Random forests with 100 trees at two maximum depths.
rf_models = [
    (
        f"RF_100_depth_{depth}",
        RandomForestRegressor(
            n_estimators=100,
            max_depth=depth,
            min_samples_leaf=5,
            n_jobs=-1,
            random_state=42,
        ),
    )
    for depth in (10, 15)
]

# Evaluated without the scaling step.
rf_results = [
    evaluate_model_timeseries(model=estimator, model_name=label, scale=False)
    for label, estimator in rf_models
]
rf_results_df = pd.concat(rf_results, ignore_index=True)

# Mean of every train/validation metric across folds, per model.
rf_aggregations = {
    f"mean_{split}_{metric}": (f"{split}_{metric}", "mean")
    for metric in ("MAE", "RMSE", "R2")
    for split in ("train", "val")
}
rf_by_model = rf_results_df.groupby("model").agg(**rf_aggregations)

# Lowest mean validation MAE first.
rf_summary = rf_by_model.sort_values("mean_val_MAE")
rf_summary
| model | mean_train_MAE | mean_val_MAE | mean_train_RMSE | mean_val_RMSE | mean_train_R2 | mean_val_R2 |
|---|---|---|---|---|---|---|
| RF_100_depth_15 | 0.335501 | 0.514797 | 0.512193 | 0.798657 | 0.986723 | 0.954657 |
| RF_100_depth_10 | 0.544814 | 0.612853 | 0.822920 | 0.913149 | 0.965817 | 0.943263 |
# Stack the per-family summaries into one leaderboard ordered by mean
# validation MAE (best first).
family_summaries = [ridge_summary, tree_summary, hgb_summary, rf_summary]
model_comparison = pd.concat(family_summaries).sort_values(by="mean_val_MAE")
model_comparison
| model | mean_train_MAE | mean_val_MAE | mean_train_RMSE | mean_val_RMSE | mean_train_R2 | mean_val_R2 |
|---|---|---|---|---|---|---|
| RF_100_depth_15 | 0.335501 | 0.514797 | 0.512193 | 0.798657 | 0.986723 | 0.954657 |
| DecisionTree_depth_15 | 0.365398 | 0.599433 | 0.568175 | 0.979580 | 0.983640 | 0.933012 |
| HGB_lr_0.1_depth_6 | 0.440810 | 0.607332 | 0.653321 | 0.819093 | 0.978479 | 0.951920 |
| HGB_lr_0.05_depth_6 | 0.480591 | 0.609946 | 0.710915 | 0.823207 | 0.974516 | 0.953166 |
| RF_100_depth_10 | 0.544814 | 0.612853 | 0.822920 | 0.913149 | 0.965817 | 0.943263 |
| DecisionTree_depth_20 | 0.155387 | 0.620787 | 0.285406 | 1.017554 | 0.995810 | 0.927901 |
| HGB_lr_0.05_depth_10 | 0.470636 | 0.633857 | 0.692547 | 0.847645 | 0.975819 | 0.949125 |
| HGB_lr_0.1_depth_10 | 0.432193 | 0.642879 | 0.635330 | 0.855562 | 0.979645 | 0.945839 |
| DecisionTree_depth_10 | 0.586091 | 0.654021 | 0.887020 | 1.009999 | 0.960305 | 0.930502 |
| DecisionTree_depth_5 | 0.961570 | 0.858932 | 1.409718 | 1.243463 | 0.899870 | 0.898872 |
| Ridge_alpha_0.01 | 1.682884 | 1.565956 | 2.262360 | 2.029577 | 0.741926 | 0.723958 |
| Ridge_alpha_0.1 | 1.682884 | 1.565956 | 2.262360 | 2.029577 | 0.741926 | 0.723958 |
| Ridge_alpha_1 | 1.682883 | 1.565956 | 2.262360 | 2.029577 | 0.741926 | 0.723958 |
| Ridge_alpha_10 | 1.682882 | 1.565957 | 2.262360 | 2.029576 | 0.741926 | 0.723959 |
| Ridge_alpha_50 | 1.682877 | 1.565962 | 2.262361 | 2.029572 | 0.741926 | 0.723962 |
| Ridge_alpha_100 | 1.682870 | 1.565969 | 2.262361 | 2.029568 | 0.741926 | 0.723966 |
| Ridge_alpha_500 | 1.682817 | 1.566023 | 2.262366 | 2.029537 | 0.741924 | 0.723998 |
| Ridge_alpha_1000 | 1.682758 | 1.566095 | 2.262382 | 2.029504 | 0.741921 | 0.724035 |
# Second-round random forests: vary the number of trees, the maximum depth
# and the minimum leaf size, always with max_features="sqrt".
rf_tuned_models = [
    (
        f"RF_{n_trees}_depth_{depth}_leaf_{leaf_size}",
        RandomForestRegressor(
            n_estimators=n_trees,
            max_depth=depth,
            min_samples_leaf=leaf_size,
            max_features="sqrt",
            n_jobs=-1,
            random_state=42,
        ),
    )
    for n_trees, depth, leaf_size in (
        (150, 15, 5),
        (200, 15, 5),
        (150, 18, 5),
        (150, 15, 3),
    )
]

# Evaluated without the scaling step.
rf_tuned_results = [
    evaluate_model_timeseries(model=estimator, model_name=label, scale=False)
    for label, estimator in rf_tuned_models
]
rf_tuned_df = pd.concat(rf_tuned_results, ignore_index=True)

# Mean of every train/validation metric across folds, per model.
rf_tuned_aggregations = {
    f"mean_{split}_{metric}": (f"{split}_{metric}", "mean")
    for metric in ("MAE", "RMSE", "R2")
    for split in ("train", "val")
}
rf_tuned_by_model = rf_tuned_df.groupby("model").agg(**rf_tuned_aggregations)

# Lowest mean validation MAE first.
rf_tuned_summary = rf_tuned_by_model.sort_values("mean_val_MAE")
rf_tuned_summary
| model | mean_train_MAE | mean_val_MAE | mean_train_RMSE | mean_val_RMSE | mean_train_R2 | mean_val_R2 |
|---|---|---|---|---|---|---|
| RF_150_depth_18_leaf_5 | 0.366232 | 0.479001 | 0.549087 | 0.707585 | 0.984800 | 0.965903 |
| RF_150_depth_15_leaf_5 | 0.454583 | 0.493641 | 0.669212 | 0.725675 | 0.977409 | 0.964364 |
| RF_200_depth_15_leaf_5 | 0.454409 | 0.493948 | 0.668912 | 0.724926 | 0.977428 | 0.964419 |
| RF_150_depth_15_leaf_3 | 0.446824 | 0.497121 | 0.655967 | 0.723273 | 0.978287 | 0.964589 |
# Second-round gradient boosting candidates. Labels are spelled out because
# they are used as row keys in the summary tables.
# Each entry: (label, learning_rate, max_iter, max_depth, l2_regularization)
hgb_tuned_configs = (
    ("HGB_lr_0.08_depth_6_iter_300_l2_0", 0.08, 300, 6, 0.0),
    ("HGB_lr_0.05_depth_6_iter_400_l2_0", 0.05, 400, 6, 0.0),
    ("HGB_lr_0.1_depth_6_iter_300_l2_0.01", 0.1, 300, 6, 0.01),
    ("HGB_lr_0.08_depth_8_iter_300_l2_0.01", 0.08, 300, 8, 0.01),
)
hgb_tuned_models = [
    (
        label,
        HistGradientBoostingRegressor(
            learning_rate=learning_rate,
            max_iter=n_iterations,
            max_depth=depth,
            max_leaf_nodes=31,
            l2_regularization=l2_penalty,
            random_state=42,
        ),
    )
    for label, learning_rate, n_iterations, depth, l2_penalty in hgb_tuned_configs
]

# Evaluated without the scaling step.
hgb_tuned_results = [
    evaluate_model_timeseries(model=estimator, model_name=label, scale=False)
    for label, estimator in hgb_tuned_models
]
hgb_tuned_df = pd.concat(hgb_tuned_results, ignore_index=True)

# Mean of every train/validation metric across folds, per model.
hgb_tuned_aggregations = {
    f"mean_{split}_{metric}": (f"{split}_{metric}", "mean")
    for metric in ("MAE", "RMSE", "R2")
    for split in ("train", "val")
}
hgb_tuned_by_model = hgb_tuned_df.groupby("model").agg(**hgb_tuned_aggregations)

# Lowest mean validation MAE first.
hgb_tuned_summary = hgb_tuned_by_model.sort_values("mean_val_MAE")
hgb_tuned_summary
| model | mean_train_MAE | mean_val_MAE | mean_train_RMSE | mean_val_RMSE | mean_train_R2 | mean_val_R2 |
|---|---|---|---|---|---|---|
| HGB_lr_0.1_depth_6_iter_300_l2_0.01 | 0.418488 | 0.588808 | 0.621999 | 0.801887 | 0.980496 | 0.953815 |
| HGB_lr_0.08_depth_6_iter_300_l2_0 | 0.430455 | 0.605328 | 0.639342 | 0.820997 | 0.979390 | 0.951453 |
| HGB_lr_0.05_depth_6_iter_400_l2_0 | 0.439307 | 0.607189 | 0.651902 | 0.818582 | 0.978573 | 0.952200 |
| HGB_lr_0.08_depth_8_iter_300_l2_0.01 | 0.423710 | 0.636932 | 0.625902 | 0.852140 | 0.980245 | 0.945434 |
# Compare the first-round winners of each ensemble family against all of the
# second-round (tuned) candidates, best mean validation MAE first.
first_round_best = model_comparison.loc[["RF_100_depth_15", "HGB_lr_0.1_depth_6"]]
finalist_frames = [first_round_best, rf_tuned_summary, hgb_tuned_summary]
finalist_comparison = pd.concat(finalist_frames).sort_values(by="mean_val_MAE")
finalist_comparison
| model | mean_train_MAE | mean_val_MAE | mean_train_RMSE | mean_val_RMSE | mean_train_R2 | mean_val_R2 |
|---|---|---|---|---|---|---|
| RF_150_depth_18_leaf_5 | 0.366232 | 0.479001 | 0.549087 | 0.707585 | 0.984800 | 0.965903 |
| RF_150_depth_15_leaf_5 | 0.454583 | 0.493641 | 0.669212 | 0.725675 | 0.977409 | 0.964364 |
| RF_200_depth_15_leaf_5 | 0.454409 | 0.493948 | 0.668912 | 0.724926 | 0.977428 | 0.964419 |
| RF_150_depth_15_leaf_3 | 0.446824 | 0.497121 | 0.655967 | 0.723273 | 0.978287 | 0.964589 |
| RF_100_depth_15 | 0.335501 | 0.514797 | 0.512193 | 0.798657 | 0.986723 | 0.954657 |
| HGB_lr_0.1_depth_6_iter_300_l2_0.01 | 0.418488 | 0.588808 | 0.621999 | 0.801887 | 0.980496 | 0.953815 |
| HGB_lr_0.08_depth_6_iter_300_l2_0 | 0.430455 | 0.605328 | 0.639342 | 0.820997 | 0.979390 | 0.951453 |
| HGB_lr_0.05_depth_6_iter_400_l2_0 | 0.439307 | 0.607189 | 0.651902 | 0.818582 | 0.978573 | 0.952200 |
| HGB_lr_0.1_depth_6 | 0.440810 | 0.607332 | 0.653321 | 0.819093 | 0.978479 | 0.951920 |
| HGB_lr_0.08_depth_8_iter_300_l2_0.01 | 0.423710 | 0.636932 | 0.625902 | 0.852140 | 0.980245 | 0.945434 |
Final model: the tuned Random Forest evaluated on the held-out test set
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Final model: the random forest configuration that ranked first in the
# finalist comparison, behind the same median imputer used during CV.
final_forest = RandomForestRegressor(
    n_estimators=150,
    max_depth=18,
    min_samples_leaf=5,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
)
final_rf_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("model", final_forest),
    ]
)

# Fit on the full training/CV period, then score once on the held-out years.
final_rf_pipeline.fit(X_train_cv, y_train_cv)
final_test_pred = final_rf_pipeline.predict(X_test)

final_test_mae, final_test_rmse, final_test_r2 = regression_metrics(y_test, final_test_pred)

print("Final Tuned Random Forest Test Results")
for label, score in (
    ("MAE :", final_test_mae),
    ("RMSE:", final_test_rmse),
    ("R² :", final_test_r2),
):
    print(label, score)
Final Tuned Random Forest Test Results
MAE : 0.43265300737751977
RMSE: 0.6991607339656488
R² : 0.9706738906233219