Code Summary
Putting together everything done through part (3), the baseline modeling code is as follows.
This time, we run K-fold cross-validation for each of the three models, then visualize and compare the averages of the resulting MAE scores.
With that, the first baseline modeling code is complete!
1. Label encoding
# Label encoding (Sex)
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Flag where each row came from: 1 = synthetic competition data, 0 = the original dataset
train['generated'] = 1
original['generated'] = 0
test['generated'] = 1

# Drop id, append the original data, and encode the Sex column as integers
train.drop(columns = 'id', inplace = True)
train = pd.concat([train, original], axis = 0).reset_index(drop = True)
train['Sex'] = le.fit_transform(train['Sex'])
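As a quick aside (my addition, not in the original post), the fitted encoder's mapping can be inspected like this; the example categories in the comment are an assumption:

# Show the integer code assigned to each class; LabelEncoder sorts classes alphabetically
print(dict(zip(le.classes_, le.transform(le.classes_))))
# e.g. {'F': 0, 'I': 1, 'M': 2} (assumed categories)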
2. Splitting the variables (independent and dependent)
# Split into X (features) and Y (target)
X = train.drop(columns = 'Age')
Y = train['Age']

# Drop the id column & label-encode the test data as well
test_baseline = test.drop(columns = 'id')
test_baseline['Sex'] = le.transform(test_baseline['Sex'])
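A small sanity check I'd add here (not in the original post): after these steps the training and test features should have identical columns, since both carry the encoded Sex column and the generated flag:

# The feature columns of X and test_baseline should now match exactly
assert list(X.columns) == list(test_baseline.columns)
print(X.shape, Y.shape, test_baseline.shape)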
3. Creating lists to hold each model's MAE and predictions
# Lists to hold the result scores (MAE) and test predictions --->> one pair per model
gb_cv_scores, gb_preds = list(), list()
hist_cv_scores, hist_preds = list(), list()
lgb_cv_scores, lgb_preds = list(), list()   # placeholder for a LightGBM model (not trained below)
xgb_cv_scores, xgb_preds = list(), list()
ens_cv_scores, ens_preds = list(), list()   # placeholder for an ensemble (see the sketch after the loop)
4. Performing K-fold cross-validation
# Create the K-fold splitter (plain KFold; the target is continuous, so no stratification)
from sklearn.model_selection import KFold

skf = KFold(n_splits = 10, random_state = 42, shuffle = True)
5. Building and training the models in each fold, and aggregating their predictions
- Model selection was covered in the previous post; here we simply look at the Gradient Boosting, HistGradientBoosting, and XGBoost results side by side
- The print statements produce one block of output per pass of the for loop
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

for i, (train_ix, test_ix) in enumerate(skf.split(X, Y)):
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]

    print('---------------------------------------------------------------')

    ######################
    ## GradientBoosting ##
    ######################
    gb_md = GradientBoostingRegressor(loss = 'absolute_error',
                                      n_estimators = 1000,
                                      max_depth = 8,
                                      learning_rate = 0.01,
                                      min_samples_split = 10,
                                      min_samples_leaf = 20).fit(X_train, Y_train)

    # Validate only on rows from the synthetic competition data (generated == 1)
    gb_pred_1 = gb_md.predict(X_test[X_test['generated'] == 1])
    gb_pred_2 = gb_md.predict(test_baseline)

    gb_score_fold = mean_absolute_error(Y_test[X_test['generated'] == 1], gb_pred_1)
    gb_cv_scores.append(gb_score_fold)
    gb_preds.append(gb_pred_2)

    print('Fold', i, '==> GradientBoosting oof MAE is ==>', gb_score_fold)

    ##########################
    ## HistGradientBoosting ##
    ##########################
    hist_md = HistGradientBoostingRegressor(loss = 'absolute_error',
                                            l2_regularization = 0.01,
                                            early_stopping = False,
                                            learning_rate = 0.01,
                                            max_iter = 1000,
                                            max_depth = 15,
                                            max_bins = 255,
                                            min_samples_leaf = 70,
                                            max_leaf_nodes = 115).fit(X_train, Y_train)

    hist_pred_1 = hist_md.predict(X_test[X_test['generated'] == 1])
    hist_pred_2 = hist_md.predict(test_baseline)

    hist_score_fold = mean_absolute_error(Y_test[X_test['generated'] == 1], hist_pred_1)
    hist_cv_scores.append(hist_score_fold)
    hist_preds.append(hist_pred_2)

    print('Fold', i, '==> HistGradient oof MAE is ==>', hist_score_fold)

    #############
    ## XGBoost ##
    #############
    xgb_md = XGBRegressor(objective = 'reg:pseudohubererror',
                          tree_method = 'exact',
                          colsample_bytree = 0.9,
                          gamma = 0.65,
                          learning_rate = 0.01,
                          max_depth = 7,
                          min_child_weight = 20,
                          n_estimators = 1000).fit(X_train, Y_train)

    xgb_pred_1 = xgb_md.predict(X_test[X_test['generated'] == 1])
    xgb_pred_2 = xgb_md.predict(test_baseline)

    xgb_score_fold = mean_absolute_error(Y_test[X_test['generated'] == 1], xgb_pred_1)
    xgb_cv_scores.append(xgb_score_fold)
    xgb_preds.append(xgb_pred_2)

    print('Fold', i, '==> XGBoost oof MAE is ==>', xgb_score_fold)
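Note that the loop above never fills the ens_* lists from step 3. A minimal sketch of how they could be populated at the end of each fold body, using a plain average of the three models' predictions (the averaging scheme is my assumption, not part of the original baseline):

    # Assumed extension: append at the end of the fold loop body
    ens_pred_1 = (gb_pred_1 + hist_pred_1 + xgb_pred_1) / 3
    ens_pred_2 = (gb_pred_2 + hist_pred_2 + xgb_pred_2) / 3

    ens_score_fold = mean_absolute_error(Y_test[X_test['generated'] == 1], ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)

    print('Fold', i, '==> Ensemble oof MAE is ==>', ens_score_fold)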
6. Averaging the K-fold MAE results per model
# Average each model's MAE scores across the folds
import numpy as np

gb_cv_score = np.mean(gb_cv_scores)
hist_cv_score = np.mean(hist_cv_scores)
xgb_cv_score = np.mean(xgb_cv_scores)

# Store the per-model results in a DataFrame
model_perf = pd.DataFrame({'Model': ['GradientBoosting', 'HistGradient', 'XGBoost'],
                           'cv-score': [gb_cv_score, hist_cv_score, xgb_cv_score]})

# Visualize the DataFrame for an at-a-glance comparison
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize = (8, 8))
ax = sns.barplot(y = 'Model', x = 'cv-score', data = model_perf)
ax.bar_label(ax.containers[0]);
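Finally, the per-fold test predictions collected in gb_preds / hist_preds / xgb_preds can be averaged across folds to build a submission. A sketch of that last step (the submission file name, the id/Age column format, and the choice of model are my assumptions, not from the original post):

# Assumed follow-up: average the 10 folds' test predictions and write a submission
final_pred = np.mean(hist_preds, axis = 0)
submission = pd.DataFrame({'id': test['id'], 'Age': final_pred})
submission.to_csv('baseline_submission.csv', index = False)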