Referring to the modeling code covered in posts (4)–(5), we now write the second baseline.
We perform feature engineering and feed the results to a CatBoost model for prediction.
Several other models are used alongside it, and K-fold cross-validation is run with 10 splits so that each model's performance can be checked on every fold.
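The imports used throughout this section are not shown in the original code; the following is a minimal sketch of what the code below relies on. The assumptions are that LADRegression comes from the scikit-lego package, and that train, test, and the fitted LabelEncoder le are carried over from the earlier posts.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklego.linear_model import LADRegression  # assumption: scikit-lego's LAD implementation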
📌 Modeling prep: feature selection & encoding
X = train.drop(columns = ['Age'])
Y = train['Age']
# Create key derived features from the train predictors
X['Meat Yield'] = X['Shucked Weight'] / (X['Weight'] + X['Shell Weight'])
X['Shell Ratio'] = X['Shell Weight'] / X['Weight']
X['Weight_to_Shucked_Weight'] = X['Weight'] / X['Shucked Weight']
X['Viscera Ratio'] = X['Viscera Weight'] / X['Weight']
# Prepare the test data / label-encode Sex
test_baseline = test.drop(columns = ['id'])
test_baseline['Sex'] = le.transform(test_baseline['Sex'])
# Create the same derived features for the test predictors
test_baseline['Meat Yield'] = test_baseline['Shucked Weight'] / (test_baseline['Weight'] + test_baseline['Shell Weight'])
test_baseline['Shell Ratio'] = test_baseline['Shell Weight'] / test_baseline['Weight']
test_baseline['Weight_to_Shucked_Weight'] = test_baseline['Weight'] / test_baseline['Shucked Weight']
test_baseline['Viscera Ratio'] = test_baseline['Viscera Weight'] / test_baseline['Weight']
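Because the derived features are ratios, a zero Weight or Shucked Weight would produce inf or NaN values. A quick sanity check (not part of the original code) can confirm the new columns are clean:
ratio_cols = ['Meat Yield', 'Shell Ratio', 'Weight_to_Shucked_Weight', 'Viscera Ratio']
# Every count below should be 0 if all weights in train and test are positive
print(np.isinf(X[ratio_cols]).sum())
print(X[ratio_cols].isna().sum())
print(np.isinf(test_baseline[ratio_cols]).sum())
print(test_baseline[ratio_cols].isna().sum())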
📌 Create MAE and prediction lists to store the results of the five models & the LAD ensembles
gb_cv_scores, gb_preds = list(), list()
hist_cv_scores, hist_preds = list(), list()
lgb_cv_scores, lgb_preds = list(), list()
xgb_cv_scores, xgb_preds = list(), list()
cat_cv_scores, cat_preds = list(), list()
ens_cv_scores_1, ens_preds_1 = list(), list()
ens_cv_scores_2, ens_preds_2 = list(), list()
ens_cv_scores_3, ens_preds_3 = list(), list()
ens_cv_scores_4, ens_preds_4 = list(), list()
📌 Create the K-fold splitter, run the models on each fold, and check the results
- The modeling code is written the same way as in the earlier modeling posts
- Five models are used: GradientBoosting, HistGradientBoosting, LightGBM, XGBoost, CatBoost
- In addition, four LAD regression models with different parameter settings are used
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)
for i, (train_ix, test_ix) in enumerate(kf.split(X, Y)):
X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
print('---------------------------------------------------------------')
######################
## GradientBoosting ##
######################
gb_features = ['Sex',
'Length',
'Diameter',
'Height',
'Weight',
'Shucked Weight',
'Viscera Weight',
'Shell Weight',
'generated']
# Keep only the main features used for this model
X_train_gb = X_train[gb_features]
X_test_gb = X_test[gb_features]
test_baseline_gb = test_baseline[gb_features]
gb_md = GradientBoostingRegressor(loss = 'absolute_error',
n_estimators = 1000,
max_depth = 8,
learning_rate = 0.01,
min_samples_split = 10,
min_samples_leaf = 20,
random_state = 42)
gb_md.fit(X_train_gb, Y_train)
gb_pred_1 = gb_md.predict(X_test_gb[X_test_gb['generated'] == 1])
gb_pred_2 = gb_md.predict(test_baseline_gb)
gb_score_fold = mean_absolute_error(Y_test[X_test_gb['generated'] == 1], gb_pred_1)
gb_cv_scores.append(gb_score_fold)
gb_preds.append(gb_pred_2)
print('Fold', i, '==> GradientBoosting oof MAE is ==>', gb_score_fold)
##########################
## HistGradientBoosting ##
##########################
hist_md = HistGradientBoostingRegressor(loss = 'absolute_error',
l2_regularization = 0.01,
early_stopping = False,
learning_rate = 0.01,
max_iter = 1000,
max_depth = 15,
max_bins = 255,
min_samples_leaf = 70,
max_leaf_nodes = 115,
random_state = 42).fit(X_train, Y_train)
hist_pred_1 = hist_md.predict(X_test[X_test['generated'] == 1])
hist_pred_2 = hist_md.predict(test_baseline)
hist_score_fold = mean_absolute_error(Y_test[X_test['generated'] == 1], hist_pred_1)
hist_cv_scores.append(hist_score_fold)
hist_preds.append(hist_pred_2)
print('Fold', i, '==> HistGradient oof MAE is ==>', hist_score_fold)
##############
## LightGBM ##
##############
lgb_md = LGBMRegressor(objective = 'mae',
n_estimators = 1000,
max_depth = 15,
learning_rate = 0.01,
num_leaves = 105,
reg_alpha = 8,
reg_lambda = 3,
subsample = 0.6,
colsample_bytree = 0.8,
random_state = 42).fit(X_train, Y_train)
lgb_pred_1 = lgb_md.predict(X_test[X_test['generated'] == 1])
lgb_pred_2 = lgb_md.predict(test_baseline)
lgb_score_fold = mean_absolute_error(Y_test[X_test['generated'] == 1], lgb_pred_1)
lgb_cv_scores.append(lgb_score_fold)
lgb_preds.append(lgb_pred_2)
print('Fold', i, '==> LightGBM oof MAE is ==>', lgb_score_fold)
#############
## XGBoost ##
#############
xgb_md = XGBRegressor(objective = 'reg:pseudohubererror',
tree_method = 'hist',
colsample_bytree = 0.9,
gamma = 0.65,
learning_rate = 0.01,
max_depth = 7,
min_child_weight = 20,
n_estimators = 1500,
subsample = 0.7,
random_state = 42).fit(X_train_gb, Y_train)
xgb_pred_1 = xgb_md.predict(X_test_gb[X_test_gb['generated'] == 1])
xgb_pred_2 = xgb_md.predict(test_baseline_gb)
xgb_score_fold = mean_absolute_error(Y_test[X_test_gb['generated'] == 1], xgb_pred_1)
xgb_cv_scores.append(xgb_score_fold)
xgb_preds.append(xgb_pred_2)
print('Fold', i, '==> XGBoost oof MAE is ==>', xgb_score_fold)
##############
## CatBoost ##
##############
cat_features = ['Sex',
'Length',
'Diameter',
'Height',
'Weight',
'Shucked Weight',
'Viscera Weight',
'Shell Weight',
'generated',
'Meat Yield',
'Shell Ratio',
'Weight_to_Shucked_Weight']
X_train_cat = X_train[cat_features]
X_test_cat = X_test[cat_features]
test_baseline_cat = test_baseline[cat_features]
cat_md = CatBoostRegressor(loss_function = 'MAE',
iterations = 1000,
learning_rate = 0.08,
depth = 10,
random_strength = 0.2,
bagging_temperature = 0.7,
border_count = 254,
l2_leaf_reg = 0.001,
verbose = False,
grow_policy = 'Lossguide',
task_type = 'CPU',
random_state = 42).fit(X_train_cat, Y_train)
cat_pred_1 = cat_md.predict(X_test_cat[X_test_cat['generated'] == 1])
cat_pred_2 = cat_md.predict(test_baseline_cat)
cat_score_fold = mean_absolute_error(Y_test[X_test_cat['generated'] == 1], cat_pred_1)
cat_cv_scores.append(cat_score_fold)
cat_preds.append(cat_pred_2)
print('Fold', i, '==> CatBoost oof MAE is ==>', cat_score_fold)
[ LAD Regression ] This code continues inside the for loop above, but it is written separately here for clarity.
What is LAD regression?
* LAD stands for Least Absolute Deviation. It is less sensitive to outliers than ordinary least squares and is useful when the data do not follow a normal distribution
* It fits the model by minimizing the sum of the absolute residuals
* Mean absolute error (MAE) is typically used to evaluate the fit of such a regression model
- x : a DataFrame built from the five models' predictions, rounded to the nearest whole number
- y : the target values of the validation fold (the rows with generated == 1)
- fit_intercept : whether the regression model learns an intercept term (i.e., whether the fitted line is allowed to miss the origin)
- True : the default; an intercept is included when fitting the line. This is the usual choice, and it is useful when the data are not centered at the origin.
- False : the model drops the intercept and forces the fitted line through the origin, which assumes the data are distributed around the origin.
- Adjusting fit_intercept controls the model's bias and lets you tune how well the model matches the data.
- positive : whether the regression coefficients are constrained to be non-negative.
- True : the coefficients are forced to be non-negative (with non-negative inputs, such as the model predictions here, this also keeps the output non-negative)
- False : the coefficients are left unconstrained
- This constraint can improve model validity in problems such as price prediction, where predictions should only take positive values. A toy comparison of LAD against ordinary least squares is sketched below.
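As a concrete illustration of the robustness claim above, the following toy comparison (synthetic data, purely for illustration; not part of the baseline) shows how a single large outlier pulls the ordinary least squares slope while the LAD slope stays near the true value of 2:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklego.linear_model import LADRegression

rng = np.random.default_rng(42)
x_toy = rng.uniform(0, 10, size = (50, 1))
y_toy = 2.0 * x_toy.ravel() + rng.normal(0, 0.3, size = 50)
y_toy[0] += 40  # one large outlier

ols = LinearRegression().fit(x_toy, y_toy)
lad = LADRegression().fit(x_toy, y_toy)
print('OLS slope :', ols.coef_[0])  # noticeably pulled toward the outlier
print('LAD slope :', lad.coef_[0])  # stays close to the true slope of 2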
##################
## LAD Ensemble ##
##################
x = pd.DataFrame({'GBC': np.round(gb_pred_1.tolist()), 'hist': np.round(hist_pred_1.tolist()),
'lgb': np.round(lgb_pred_1.tolist()), 'xgb': np.round(xgb_pred_1.tolist()),
'cat': np.round(cat_pred_1.tolist())})
y = Y_test[X_test['generated'] == 1]
x_test = pd.DataFrame({'GBC': np.round(gb_pred_2.tolist()), 'hist': np.round(hist_pred_2.tolist()),
'lgb': np.round(lgb_pred_2.tolist()), 'xgb': np.round(xgb_pred_2.tolist()),
'cat': np.round(cat_pred_2.tolist())})
lad_md_1 = LADRegression(fit_intercept = True, positive = False).fit(x, y)
lad_md_2 = LADRegression(fit_intercept = True, positive = True).fit(x, y)
lad_md_3 = LADRegression(fit_intercept = False, positive = True).fit(x, y)
lad_md_4 = LADRegression(fit_intercept = False, positive = False).fit(x, y)
lad_pred_1 = lad_md_1.predict(x)
lad_pred_2 = lad_md_2.predict(x)
lad_pred_3 = lad_md_3.predict(x)
lad_pred_4 = lad_md_4.predict(x)
lad_pred_test_1 = lad_md_1.predict(x_test)
lad_pred_test_2 = lad_md_2.predict(x_test)
lad_pred_test_3 = lad_md_3.predict(x_test)
lad_pred_test_4 = lad_md_4.predict(x_test)
ens_score_1 = mean_absolute_error(y, lad_pred_1)
ens_cv_scores_1.append(ens_score_1)
ens_preds_1.append(lad_pred_test_1)
ens_score_2 = mean_absolute_error(y, lad_pred_2)
ens_cv_scores_2.append(ens_score_2)
ens_preds_2.append(lad_pred_test_2)
ens_score_3 = mean_absolute_error(y, lad_pred_3)
ens_cv_scores_3.append(ens_score_3)
ens_preds_3.append(lad_pred_test_3)
ens_score_4 = mean_absolute_error(y, lad_pred_4)
ens_cv_scores_4.append(ens_score_4)
ens_preds_4.append(lad_pred_test_4)
print('Fold', i, '==> LAD Model 1 ensemble oof MAE is ==>', ens_score_1)
print('Fold', i, '==> LAD Model 2 ensemble oof MAE is ==>', ens_score_2)
print('Fold', i, '==> LAD Model 3 ensemble oof MAE is ==>', ens_score_3)
print('Fold', i, '==> LAD Model 4 ensemble oof MAE is ==>', ens_score_4)
📌 Checking the results
📌 Average the per-fold MAEs, inspect them in a DataFrame, and visualize
gb_cv_score = np.mean(gb_cv_scores)
hist_cv_score = np.mean(hist_cv_scores)
lgb_cv_score = np.mean(lgb_cv_scores)
xgb_cv_score = np.mean(xgb_cv_scores)
cat_cv_score = np.mean(cat_cv_scores)
ens_cv_score_1 = np.mean(ens_cv_scores_1)
ens_cv_score_2 = np.mean(ens_cv_scores_2)
ens_cv_score_3 = np.mean(ens_cv_scores_3)
ens_cv_score_4 = np.mean(ens_cv_scores_4)
model_perf = pd.DataFrame({'Model': ['GradientBoosting', 'HistGradient', 'LightGBM', 'XGBoost', 'CatBoost',
'LAD Model 1',
'LAD Model 2',
'LAD Model 3',
'LAD Model 4'],
'cv-score': [gb_cv_score, hist_cv_score, lgb_cv_score, xgb_cv_score, cat_cv_score,
ens_cv_score_1,
ens_cv_score_2,
ens_cv_score_3,
ens_cv_score_4]})
plt.figure(figsize = (8, 8))
ax = sns.barplot(y = 'Model', x = 'cv-score', data = model_perf)
ax.bar_label(ax.containers[0]);
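Finally, the per-fold test predictions collected above can be averaged across the 10 folds to produce a submission. A hedged sketch, assuming the usual Kaggle sample_submission.csv format and a hypothetical output file name:
final_pred = np.mean(ens_preds_1, axis = 0)  # average the 10 fold-level test predictions
submission = pd.read_csv('sample_submission.csv')  # assumption: standard competition file
submission['Age'] = final_pred
submission.to_csv('baseline_2.csv', index = False)  # hypothetical file name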