Income Class Classification: Data Cleaning, EDA, and ML Modeling
# This data was extracted from the 1994 Census bureau database by Ronny Kohavi and Barry Becker (Data Mining and Visualization, Silicon Graphics). A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0)). The prediction task is to determine whether a person makes over $50K a year.
# Dataset source: https://www.kaggle.com/datasets/uciml/adult-census-income
import pandas as pd
import numpy as np
# Load the Adult Census Income CSV (path is relative to the working
# directory) and preview the first five records.
csv_path = "../data/adult.csv"
df = pd.read_csv(csv_path)
print(df.head(n=5))
age workclass fnlwgt education education.num marital.status \
0 90 ? 77053 HS-grad 9 Widowed
1 82 Private 132870 HS-grad 9 Widowed
2 66 ? 186061 Some-college 10 Widowed
3 54 Private 140359 7th-8th 4 Divorced
4 41 Private 264663 Some-college 10 Separated
occupation relationship race sex capital.gain \
0 ? Not-in-family White Female 0
1 Exec-managerial Not-in-family White Female 0
2 ? Unmarried Black Female 0
3 Machine-op-inspct Unmarried White Female 0
4 Prof-specialty Own-child White Female 0
capital.loss hours.per.week native.country income
0 4356 40 United-States <=50K
1 4356 18 United-States <=50K
2 4356 40 United-States <=50K
3 3900 40 United-States <=50K
4 3900 40 United-States <=50K
# Preview the last five records.
last_rows = df.tail(n=5)
print(last_rows)
age workclass fnlwgt education education.num marital.status \
32556 22 Private 310152 Some-college 10 Never-married
32557 27 Private 257302 Assoc-acdm 12 Married-civ-spouse
32558 40 Private 154374 HS-grad 9 Married-civ-spouse
32559 58 Private 151910 HS-grad 9 Widowed
32560 22 Private 201490 HS-grad 9 Never-married
occupation relationship race sex capital.gain \
32556 Protective-serv Not-in-family White Male 0
32557 Tech-support Wife White Female 0
32558 Machine-op-inspct Husband White Male 0
32559 Adm-clerical Unmarried White Female 0
32560 Adm-clerical Own-child White Male 0
capital.loss hours.per.week native.country income
32556 0 40 United-States <=50K
32557 0 38 United-States <=50K
32558 0 40 United-States >50K
32559 0 40 United-States <=50K
32560 0 20 United-States <=50K
# Report the dataset size as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(32561, 15)
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called directly; wrapping it in print() emits a spurious "None" line.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 32561 non-null int64
1 workclass 32561 non-null object
2 fnlwgt 32561 non-null int64
3 education 32561 non-null object
4 education.num 32561 non-null int64
5 marital.status 32561 non-null object
6 occupation 32561 non-null object
7 relationship 32561 non-null object
8 race 32561 non-null object
9 sex 32561 non-null object
10 capital.gain 32561 non-null int64
11 capital.loss 32561 non-null int64
12 hours.per.week 32561 non-null int64
13 native.country 32561 non-null object
14 income 32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns.
numeric_summary = df.describe()
print(numeric_summary)
age fnlwgt education.num capital.gain capital.loss \
count 32561.000000 3.256100e+04 32561.000000 32561.000000 32561.000000
mean 38.581647 1.897784e+05 10.080679 1077.648844 87.303830
std 13.640433 1.055500e+05 2.572720 7385.292085 402.960219
min 17.000000 1.228500e+04 1.000000 0.000000 0.000000
25% 28.000000 1.178270e+05 9.000000 0.000000 0.000000
50% 37.000000 1.783560e+05 10.000000 0.000000 0.000000
75% 48.000000 2.370510e+05 12.000000 0.000000 0.000000
max 90.000000 1.484705e+06 16.000000 99999.000000 4356.000000
hours.per.week
count 32561.000000
mean 40.437456
std 12.347429
min 1.000000
25% 40.000000
50% 40.000000
75% 45.000000
max 99.000000
# Count NaN entries per column. Missing values that are encoded as the string
# "?" are not NaN, so they do not appear in this tally.
nan_counts = df.isna().sum()
print(nan_counts)
age 0
workclass 0
fnlwgt 0
education 0
education.num 0
marital.status 0
occupation 0
relationship 0
race 0
sex 0
capital.gain 0
capital.loss 0
hours.per.week 0
native.country 0
income 0
dtype: int64
# Bare expression: shows the column labels when run as a notebook cell.
df.columns
Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
'marital.status', 'occupation', 'relationship', 'race', 'sex',
'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
'income'],
dtype='object')
# Show the dtype pandas inferred for each column.
column_dtypes = df.dtypes
print(column_dtypes)
age int64
workclass object
fnlwgt int64
education object
education.num int64
marital.status object
occupation object
relationship object
race object
sex object
capital.gain int64
capital.loss int64
hours.per.week int64
native.country object
income object
dtype: object
Columns with dtype `object` are categorical, and columns with dtype `int64` are numerical.
# Frequency of each occupation label, most frequent first. The "?" label is
# the placeholder this notebook later treats as a missing value.
df["occupation"].value_counts()
occupation
Prof-specialty 4140
Craft-repair 4099
Exec-managerial 4066
Adm-clerical 3770
Sales 3650
Other-service 3295
Machine-op-inspct 2002
? 1843
Transport-moving 1597
Handlers-cleaners 1370
Farming-fishing 994
Tech-support 928
Protective-serv 649
Priv-house-serv 149
Armed-Forces 9
Name: count, dtype: int64
# Print the frequency table of every categorical column (target included) so
# rare levels and "?" placeholders are easy to spot.
categ_col = [
    "workclass", "education", "marital.status", "occupation", "relationship",
    "race", "sex", "native.country", "income",
]

for col in categ_col:
    # The loop body must be indented; each column gets a header line
    # followed by its value counts.
    print(f"\n--- {col} ---")
    print(df[col].value_counts())
--- workclass ---
workclass
Private 22696
Self-emp-not-inc 2541
Local-gov 2093
? 1836
State-gov 1298
Self-emp-inc 1116
Federal-gov 960
Without-pay 14
Never-worked 7
Name: count, dtype: int64
--- education ---
education
HS-grad 10501
Some-college 7291
Bachelors 5355
Masters 1723
Assoc-voc 1382
11th 1175
Assoc-acdm 1067
10th 933
7th-8th 646
Prof-school 576
9th 514
12th 433
Doctorate 413
5th-6th 333
1st-4th 168
Preschool 51
Name: count, dtype: int64
--- marital.status ---
marital.status
Married-civ-spouse 14976
Never-married 10683
Divorced 4443
Separated 1025
Widowed 993
Married-spouse-absent 418
Married-AF-spouse 23
Name: count, dtype: int64
--- occupation ---
occupation
Prof-specialty 4140
Craft-repair 4099
Exec-managerial 4066
Adm-clerical 3770
Sales 3650
Other-service 3295
Machine-op-inspct 2002
? 1843
Transport-moving 1597
Handlers-cleaners 1370
Farming-fishing 994
Tech-support 928
Protective-serv 649
Priv-house-serv 149
Armed-Forces 9
Name: count, dtype: int64
--- relationship ---
relationship
Husband 13193
Not-in-family 8305
Own-child 5068
Unmarried 3446
Wife 1568
Other-relative 981
Name: count, dtype: int64
--- race ---
race
White 27816
Black 3124
Asian-Pac-Islander 1039
Amer-Indian-Eskimo 311
Other 271
Name: count, dtype: int64
--- sex ---
sex
Male 21790
Female 10771
Name: count, dtype: int64
--- native.country ---
native.country
United-States 29170
Mexico 643
? 583
Philippines 198
Germany 137
Canada 121
Puerto-Rico 114
El-Salvador 106
India 100
Cuba 95
England 90
Jamaica 81
South 80
China 75
Italy 73
Dominican-Republic 70
Vietnam 67
Guatemala 64
Japan 62
Poland 60
Columbia 59
Taiwan 51
Haiti 44
Iran 43
Portugal 37
Nicaragua 34
Peru 31
Greece 29
France 29
Ecuador 28
Ireland 24
Hong 20
Cambodia 19
Trinadad&Tobago 19
Laos 18
Thailand 18
Yugoslavia 16
Outlying-US(Guam-USVI-etc) 14
Hungary 13
Honduras 13
Scotland 12
Holand-Netherlands 1
Name: count, dtype: int64
--- income ---
income
<=50K 24720
>50K 7841
Name: count, dtype: int64
# Keep "?" in the employment-related columns as its own explicit "Unknown"
# category instead of imputing a value.
for employment_col in ("workclass", "occupation"):
    df[employment_col] = df[employment_col].replace("?", "Unknown")
As with `workclass` and `occupation` above, these missing values could simply be labelled `Unknown`. Here I prefer to fill them with the mode, i.e. the most frequent value in the column. About 90% of the `native.country` values (29,170 of 32,561) are “United-States”, so statistically the 583 missing values are very likely US-based as well.
# Impute missing native.country values (encoded as "?") with the most frequent
# real country. The "?" rows are masked out before taking the mode so the
# placeholder itself can never be selected as the fill value.
known_country = df["native.country"] != "?"
country_missing = df.loc[known_country, "native.country"].mode()[0]

# Replace every "?" with that most frequent country.
df["native.country"] = df["native.country"].replace("?", country_missing)
The `native.country` column contains exactly one row labelled `Holand-Netherlands`, which makes it an outlier category. A category with a single row is effectively noise: a machine learning model cannot learn anything from one example, and keeping it can make it harder for the model to generalize.
# Discard the rows whose country is 'Holand-Netherlands', then preview the
# cleaned frame.
is_netherlands = df["native.country"].eq("Holand-Netherlands")
df = df[~is_netherlands]
df.head()
| age | workclass | fnlwgt | education | education.num | marital.status | occupation | relationship | race | sex | capital.gain | capital.loss | hours.per.week | native.country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 90 | Unknown | 77053 | HS-grad | 9 | Widowed | Unknown | Not-in-family | White | Female | 0 | 4356 | 40 | United-States | <=50K |
| 1 | 82 | Private | 132870 | HS-grad | 9 | Widowed | Exec-managerial | Not-in-family | White | Female | 0 | 4356 | 18 | United-States | <=50K |
| 2 | 66 | Unknown | 186061 | Some-college | 10 | Widowed | Unknown | Unmarried | Black | Female | 0 | 4356 | 40 | United-States | <=50K |
| 3 | 54 | Private | 140359 | 7th-8th | 4 | Divorced | Machine-op-inspct | Unmarried | White | Female | 0 | 3900 | 40 | United-States | <=50K |
| 4 | 41 | Private | 264663 | Some-college | 10 | Separated | Prof-specialty | Own-child | White | Female | 0 | 3900 | 40 | United-States | <=50K |
# 'fnlwgt' holds the census sampling weight, not a characteristic of the
# individual, and generally does not help predict income, so it is removed.
df = df.drop(columns=["fnlwgt"])
df.columns
Index(['age', 'workclass', 'education', 'education.num', 'marital.status',
'occupation', 'relationship', 'race', 'sex', 'capital.gain',
'capital.loss', 'hours.per.week', 'native.country', 'income'],
dtype='object')
# Print the label counts of `education` followed by the code counts of
# `education.num` so the two columns can be compared.
for edu_col in ("education", "education.num"):
    print(df[edu_col].value_counts())
education
HS-grad 10501
Some-college 7290
Bachelors 5355
Masters 1723
Assoc-voc 1382
11th 1175
Assoc-acdm 1067
10th 933
7th-8th 646
Prof-school 576
9th 514
12th 433
Doctorate 413
5th-6th 333
1st-4th 168
Preschool 51
Name: count, dtype: int64
education.num
9 10501
10 7290
13 5355
14 1723
11 1382
7 1175
12 1067
6 933
4 646
15 576
5 514
8 433
16 413
3 333
2 168
1 51
Name: count, dtype: int64
Current column names (like education.num and marital.status) use dots, which can cause issues in certain Python libraries (like Scikit-learn) or when using the df.column_name dot-notation in pandas.
# Replace the dots in the dotted column names with underscores
# (e.g. "education.num" -> "education_num").
dotted_columns = (
    "education.num",
    "marital.status",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
    "native.country",
)
df = df.rename(columns={old: old.replace(".", "_") for old in dotted_columns})
print(list(df.columns))
['age', 'workclass', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
Classification
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
classification_report,
confusion_matrix
)
# Separate the target column from the predictors.
y = df["income"]
X = df.drop(columns="income")

# Stage 1: stratified 80/20 split into the training set and a temporary
# hold-out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# Stage 2: halve the hold-out into validation and test (10% each overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

# Report the shape of every partition.
for label, part in (
    ("X_train shape:", X_train),
    ("X_val shape :", X_val),
    ("X_test shape :", X_test),
    ("y_train shape:", y_train),
    ("y_val shape :", y_val),
    ("y_test shape :", y_test),
):
    print(label, part.shape)
X_train shape: (26048, 13)
X_val shape : (3256, 13)
X_test shape : (3256, 13)
y_train shape: (26048,)
y_val shape : (3256,)
y_test shape : (3256,)
# Encode the target as integers: "<=50K" -> 0, ">50K" -> 1.
income_codes = {"<=50K": 0, ">50K": 1}


def _encode_income(labels):
    """Return *labels* stripped of whitespace and mapped to 0/1 integers.

    Raises:
        ValueError: if any label is missing from ``income_codes``. Series.map
            would otherwise turn such labels into NaN without any warning.
    """
    cleaned = labels.str.strip()
    encoded = cleaned.map(income_codes)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = cleaned[unmapped].unique().tolist()
        raise ValueError(f"Unexpected income labels: {unknown}")
    return encoded.astype("int64")


y_train = _encode_income(y_train)
y_val = _encode_income(y_val)
y_test = _encode_income(y_test)
# Partition the feature columns into numeric and non-numeric. Using
# include/exclude "number" puts every column in exactly one list; fixed dtype
# names such as "object"/"int64" would leave out any column of another dtype,
# and ColumnTransformer would then silently drop it.
categorical_cols = X_train.select_dtypes(exclude="number").columns.tolist()
numerical_cols = X_train.select_dtypes(include="number").columns.tolist()
print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)
Categorical columns: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
Numerical columns: ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
# Column-wise preprocessing: standardise the numeric features and one-hot
# encode the categorical ones. handle_unknown="ignore" keeps transform from
# raising on categories that were not seen during fit.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])
# Candidate classifiers keyed by display name; all share the same seed so
# runs are repeatable.
model_seed = 42
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=model_seed),
    "Decision Tree": DecisionTreeClassifier(random_state=model_seed),
    "Random Forest": RandomForestClassifier(random_state=model_seed),
    "Gradient Boosting": GradientBoostingClassifier(random_state=model_seed),
}
# Hyper-parameter search spaces, one per candidate model. The "model__"
# prefix addresses the pipeline step named "model".
logreg_space = {
    "model__C": np.logspace(start=-3, stop=2, num=20),
    "model__solver": ["liblinear", "lbfgs"],
    "model__penalty": ["l2"],
}

tree_space = {
    "model__max_depth": [None, 5, 10, 15, 20, 30],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 4, 8],
    "model__criterion": ["gini", "entropy"],
}

forest_space = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2],
    "model__max_features": ["sqrt", "log2"],
}

boosting_space = {
    "model__n_estimators": [100, 200],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [3, 4],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2],
}

# Keys must match the names used in the `models` dictionary.
param_distributions = {
    "Logistic Regression": logreg_space,
    "Decision Tree": tree_space,
    "Random Forest": forest_space,
    "Gradient Boosting": boosting_space,
}
# Tune every candidate model with a randomized hyper-parameter search on the
# training split, then score the best pipeline found on the validation split.
tuned_models = {}  # model name -> best fitted pipeline
results = []       # one record of CV / validation metrics per model

for model_name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Running RandomizedSearchCV for: {model_name}")
    print(f"{'='*60}")

    # Preprocessing lives inside the pipeline, so it is fitted on the
    # training part of each CV fold only.
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model),
    ])

    # 15 sampled configurations x 5 folds, ranked by F1 of the positive class.
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions[model_name],
        n_iter=15,
        scoring="f1",
        cv=5,
        verbose=3,
        random_state=42,
        n_jobs=-1,
    )
    random_search.fit(X_train, y_train)

    best_model = random_search.best_estimator_
    tuned_models[model_name] = best_model

    # Validation metrics for the best pipeline of this model family.
    y_val_pred = best_model.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    prec = precision_score(y_val, y_val_pred)
    rec = recall_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)

    results.append({
        "model": model_name,
        "best_cv_f1": random_search.best_score_,
        "val_accuracy": acc,
        "val_precision": prec,
        "val_recall": rec,
        "val_f1": f1,
        "best_params": random_search.best_params_,
    })
============================================================
Running RandomizedSearchCV for: Logistic Regression
============================================================
Fitting 5 folds for each of 15 candidates, totalling 75 fits
============================================================
Running RandomizedSearchCV for: Decision Tree
============================================================
Fitting 5 folds for each of 15 candidates, totalling 75 fits
============================================================
Running RandomizedSearchCV for: Random Forest
============================================================
Fitting 5 folds for each of 15 candidates, totalling 75 fits
============================================================
Running RandomizedSearchCV for: Gradient Boosting
============================================================
Fitting 5 folds for each of 15 candidates, totalling 75 fits
# Rank the models by validation F1 (best first) and print the metric columns.
metric_columns = [
    "model", "best_cv_f1", "val_accuracy", "val_precision", "val_recall", "val_f1",
]
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="val_f1", ascending=False)
print(results_df[metric_columns])
model best_cv_f1 val_accuracy val_precision val_recall \
3 Gradient Boosting 0.712974 0.865172 0.776000 0.618622
2 Random Forest 0.686117 0.859951 0.759494 0.612245
0 Logistic Regression 0.657668 0.850737 0.735759 0.593112
1 Decision Tree 0.666943 0.852273 0.783178 0.534439
val_f1
3 0.688432
2 0.677966
0 0.656780
1 0.635330
# Take the model named in the first row of the results table.
best_model_name = results_df["model"].iloc[0]
print("Best tuned model:", best_model_name)

# Persist the selected fitted pipeline to disk.
best_final_model = tuned_models[best_model_name]
joblib.dump(best_final_model, "best_adult_income_model.pkl")
print("Best model saved as best_adult_income_model.pkl")
Best tuned model: Gradient Boosting
Best model saved as best_adult_income_model.pkl
# Score the selected pipeline on the validation split.
y_val_pred_best = best_final_model.predict(X_val)

# Headline metrics, printed in a fixed order.
for label, metric_fn in (
    ("Validation Accuracy :", accuracy_score),
    ("Validation Precision:", precision_score),
    ("Validation Recall :", recall_score),
    ("Validation F1 :", f1_score),
):
    print(label, metric_fn(y_val, y_val_pred_best))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_val_pred_best))

print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_best))
Validation Accuracy : 0.8651719901719902
Validation Precision: 0.776
Validation Recall : 0.6186224489795918
Validation F1 : 0.6884315117104329
Confusion Matrix:
[[2332 140]
[ 299 485]]
Classification Report:
precision recall f1-score support
0 0.89 0.94 0.91 2472
1 0.78 0.62 0.69 784
accuracy 0.87 3256
macro avg 0.83 0.78 0.80 3256
weighted avg 0.86 0.87 0.86 3256
# Score the selected pipeline on the test split.
y_test_pred = best_final_model.predict(X_test)

# Headline metrics, printed in a fixed order.
for label, metric_fn in (
    ("Test Accuracy :", accuracy_score),
    ("Test Precision:", precision_score),
    ("Test Recall :", recall_score),
    ("Test F1 :", f1_score),
):
    print(label, metric_fn(y_test, y_test_pred))

print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

print("\nTest Classification Report:")
print(classification_report(y_test, y_test_pred))
Test Accuracy : 0.8777641277641277
Test Precision: 0.7924242424242425
Test Recall : 0.6670918367346939
Test F1 : 0.724376731301939
Test Confusion Matrix:
[[2335 137]
[ 261 523]]
Test Classification Report:
precision recall f1-score support
0 0.90 0.94 0.92 2472
1 0.79 0.67 0.72 784
accuracy 0.88 3256
macro avg 0.85 0.81 0.82 3256
weighted avg 0.87 0.88 0.87 3256