1. ์ด์ application(๋ฉ์ธ๋ฐ์ดํฐ) ์ Feature Engineering ํจ์ ๋ณต์ฌ
def get_apps_processed(apps):
# EXT_SOURCE_X FEATURE ๊ฐ๊ณต
apps['APPS_EXT_SOURCE_MEAN'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
apps['APPS_EXT_SOURCE_STD'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
apps['APPS_EXT_SOURCE_STD'] = apps['APPS_EXT_SOURCE_STD'].fillna(apps['APPS_EXT_SOURCE_STD'].mean())
# AMT_CREDIT ๋น์จ๋ก Feature ๊ฐ๊ณต
apps['APPS_ANNUITY_CREDIT_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_CREDIT']
apps['APPS_GOODS_CREDIT_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_CREDIT']
# AMT_INCOME_TOTAL ๋น์จ๋ก Feature ๊ฐ๊ณต
apps['APPS_ANNUITY_INCOME_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_INCOME_TOTAL']
apps['APPS_CREDIT_INCOME_RATIO'] = apps['AMT_CREDIT']/apps['AMT_INCOME_TOTAL']
apps['APPS_GOODS_INCOME_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_INCOME_TOTAL']
apps['APPS_CNT_FAM_INCOME_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['CNT_FAM_MEMBERS']
# DAYS_BIRTH, DAYS_EMPLOYED ๋น์จ๋ก Feature ๊ฐ๊ณต
apps['APPS_EMPLOYED_BIRTH_RATIO'] = apps['DAYS_EMPLOYED']/apps['DAYS_BIRTH']
apps['APPS_INCOME_EMPLOYED_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_EMPLOYED']
apps['APPS_INCOME_BIRTH_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_BIRTH']
apps['APPS_CAR_BIRTH_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_BIRTH']
apps['APPS_CAR_EMPLOYED_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_EMPLOYED']
return apps
โถ๏ธ ์ด์ ๋์ถ ๋ฐ์ดํฐ(prev) ๋ฉ์ธ๋ฐ์ดํฐ(apps)๋ฅผ merge
prev_app_outer = prev.merge(apps['SK_ID_CURR'], on = 'SK_ID_CURR', how = 'outer', indicator = True)
๐ merge ํ ๋ indicator = True ๋ก ์กฐ์ธ ํ, ๋๋ฝ๋ ์งํฉ์ ํ์ธํ ์ ์์.
prev_app_outer['_merge'].value_counts()
๐ _merge ์ปฌ๋ผ์ value_counts() ํ์ฌ ๊ฐ๊ฐ ์ด๋ป๊ฒ ์กฐ์ธ๋์ด์๋์ง ํ์ธ
2. ์ฃผ์ ์ปฌ๋ผ EDA
โถ๏ธ apps(๋ฉ์ธ๋ฐ์ดํฐ)์ ID๋ณ๋ก, ์ด์ ๋์ถ ์ด๋ ฅ ๊ฑด์๋ฅผ ํ์ธ
- groupby, boxplot
prev.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
โถ๏ธ ์ซ์ํ ํผ์ณ๋ค์ ๋ถํฌ ํ์ธ(ํ์คํ ๊ทธ๋จ) / Target ์ ํ์ ๋ฐ๋ผ
- ๋ฉ์ธ๋ฐ์ดํฐ์ ID, Target ๊ฐ๊ณผ prev(๊ณผ๊ฑฐ ๋์ถ์ด๋ ฅ) ๋ฐ์ดํฐ merge
app_prev= prev.merge(app_train[['SK_ID_CURR', 'TARGET']], on = 'SK_ID_CURR', how = 'left')
app_prev.shape
- Target ๊ฐ์ ๋ฐ๋ผ ํ์คํ ๊ทธ๋จ์ ๊ทธ๋ฆฌ๋ ํจ์
def show_hist_by_target(df, columns):
cond_1 = (df['TARGET'] == 1)
cond_0 = (df['TARGET'] == 0)
for column in columns:
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), squeeze=False)
sns.violinplot(x='TARGET', y=column, data=df, ax=axs[0][0] )
sns.distplot(df[cond_0][column], ax=axs[0][1], label='0', color='blue')
sns.distplot(df[cond_1][column], ax=axs[0][1], label='1', color='red')
- ์ซ์ํ ํผ์ณ๋ค์ ์ปฌ๋ผ๋ช ๋ถ๋ฆฌ
- ๊ธฐ์กด ์ปฌ๋ผ๋ค์ค ID, target ์ปฌ๋ผ์ ์ ์ธํ๊ณ ๋ฆฌ์คํธ์ ์ ์ฅ
num_columns = app_prev.dtypes[app_prev.dtypes != 'object'].index.tolist()
num_columns = [column for column in num_columns if column not in ['SK_ID_CURR', 'SK_ID_PREV', 'TARGET']]
- ํ์คํ ๊ทธ๋จ ์๊ฐํ ๋ฐ ๊ฒฐ๊ณผ ํด์
show_hist_by_target(app_prev, num_columns)
โถ๏ธ ์นดํ ๊ณ ๋ฆฌํ ํผ์ณ๋ค์ ๋ถํฌ ํ์ธ(ํ์คํ ๊ทธ๋จ) / Target ์ ํ์ ๋ฐ๋ผ
object_columns = app_prev.dtypes[app_prev.dtypes=='object'].index.tolist()
object_columns
- ์นดํ ๊ณ ๋ฆฌํ ์ปฌ๋ผ์ ๐ catplot์ ์ฌ์ฉ!!
def show_category_by_target(df, columns):
for column in columns:
chart = sns.catplot(x=column, col="TARGET", data=df, kind="count")
chart.set_xticklabels(rotation=65)
show_category_by_target(app_prev, object_columns)
โถ๏ธ groupby ๋ก ํผ์ณ ํ์ธํ๊ธฐ
- agg_dict ๋ฅผ ์ ์ํ๊ณ , groupby ๊ฒฐ๊ณผ์ ํ๋ฒ์ ์ ์ฉ
agg_dict = {
# ๊ธฐ์กด ์ปฌ๋ผ.
'SK_ID_CURR':['count'],
'AMT_CREDIT':['mean', 'max', 'sum'],
'AMT_ANNUITY':['mean', 'max', 'sum'],
'AMT_APPLICATION':['mean', 'max', 'sum'],
'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
'AMT_GOODS_PRICE':['mean', 'max', 'sum']
}
prev_group = prev.groupby('SK_ID_CURR')
prev_amt_agg = prev_group.agg(agg_dict)
prev_amt_agg.head() #๊ฒฐ๊ณผ๋ ๋ฉํฐ๋ ๋ฒจ ์ธ๋ฑ์ค(์ปฌ๋ผ)
- ๋ฉํฐ ์ธ๋ฑ์ค ์ปฌ๋ผ๋ณ๊ฒฝ >> ์ธ๋๋ฐ('_')๋ก ์ฐ๊ฒฐํด์ ์ปฌ๋ผ๋ช ๊น๋ํ๊ฒ ์ ๋ฆฌ
# ์ปฌ๋ผ์ด ๋ฉํฐ์ธ๋ฑ์ค์ธ๊ฑฐ ํ์ธ
prev_amt_agg.columns
prev_amt_agg.columns =
['PREV_' + ('_').join(column).upper() for column in prev_amt_agg.columns]
prev_amt_agg.head()
โถ๏ธ prev ํผ์ณ ๊ฐ๊ณต
# ๋์ถ ์ ์ฒญ ๊ธ์ก๊ณผ ์ค์ ๋์ถ์ก/๋์ถ ์ํ๊ธ์ก ์ฐจ์ด ๋ฐ ๋น์จ
prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT']/prev['AMT_APPLICATION']
prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY']/prev['AMT_APPLICATION']
prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE']/prev['AMT_APPLICATION']
โถ๏ธ ์ด์์น๋ฅผ Null๋ก ๋์ฒด
prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace = True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
# ์ฒซ๋ฒ์งธ ๋ง๊ธฐ์ผ๊ณผ ๋ง์ง๋ง ๋ง๊ธฐ์ผ๊น์ง์ ๊ธฐ๊ฐ
prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
โถ๏ธ ์ด์์จ์ null ๊ฐ์ด ๋ง์ผ๋ฏ๋ก, ์ง์ ์ปฌ๋ผ ์์ฑ
# ์ ๋ฉ๋ถ์ก * ํ์ =>> ์ด ๋์ถ์ํ์ก
all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
# ์ด์์ฌ = (๋์ถ์ํ์ก/๋์ถ์ก -1) / ๋์ถ์ํํ์
prev['PREV_INTERESTS_RATE'] = (all_pay / prev['AMT_CREDIT'] -1)/prev['CNT_PAYMENT']
=>>> ์์ฑ๋ ์ปฌ๋ผ ํ์ธ
prev.iloc[:, -7:].head(10)
โถ๏ธ ID ๊ธฐ์ค์ผ๋ก (๊ธฐ์กด)๋์ถ ์ํ๊ฐ refused ์ธ ๊ฒฝ์ฐ์ ๊ฑด์ ๋ฐ ๊ณผ๊ฑฐ ๋์ถ ๊ฑด ๋๋น ๋น์จ ๊ตฌํ๊ธฐ
- ๐ง ๋ฉ์ธ ๋ฐ์ดํฐ์์ ๋์ถ์ํ๊ฐ '๊ฑฐ์ '์ธ ๊ฒฝ์ฐ์๋ ๊ณผ๊ฑฐ ๋์ถ ๊ฑด ์์ ์ด๋ค ๊ด๋ จ์ด ์๋์ง ์์๋ณด์
# ์กฐ๊ฑด
cond_refused = (prev['NAME_CONTRACT_STATUS'] == 'Refused')
# ์กฐ๊ฑด ์ ์ฉ
prev_refused = prev[cond_refused]
# ํ์ธ
prev_refused.shape, prev.shape
##((138377, 44), (798432, 44))
- ์กฐ๊ฑด์ ์ ์ฉํ ๋ฐ์ดํฐํ๋ ์์ refused ๊ฑด์ ๋์ถ
- groupby, count
- pd.DataFrame()์ผ๋ก groupby ๊ฒฐ๊ณผ๋ฅผ ํ์ธ
prev_refused_agg = prev_refused.groupby('SK_ID_CURR')['SK_ID_CURR'].count()
prev_refused_agg.shape, prev_amt_agg.shape
##((79325,), (281965, 39))
pd.DataFrame(prev_refused_agg) # reset_index()๊ฐ ์๋ ์ํ
- reset_index()๋ก ์ธ๋ฑ์ค์ ์ปฌ๋ผ๋ช ์ค๋ณต ํ์ด์ฃผ๊ธฐ
- name์ ์ง์ ํ์ฌ ์ปฌ๋ผ๋ช ์ ๋ณ๊ฒฝ
prev_refused_agg = prev_refused_agg.reset_index(name = 'PREV_REFUSED_COUNT')
prev_refused_agg.head()
โถ๏ธ ์์์ ๋ง๋ prev_amt_agg ์ปฌ๋ผ๊ณผ ๊ฒฐํฉ
prev_amt_agg = prev_amt_agg.reset_index()
prev_amt_refused_agg = prev_amt_agg.merge(prev_refused_agg, on = 'SK_ID_CURR', how = 'left')
- ๊ฒฐ์ธก์น ์ฒ๋ฆฌ :
# ๊ฒฐ์ธก์น ๊ฐ์๊น์ง ํ์ธ
prev_amt_refused_agg['PREV_REFUSED_COUNT'].value_counts(dropna = False)
# ๊ฒฐ์ธก์น๋ฅผ 0์ผ๋ก ์ฑ์
prev_amt_refused_agg = prev_amt_refused_agg.fillna(0)
- ๊ณผ๊ฑฐ ๋์ถ ๊ฑด ์ ๋๋น ๋์ถ ๊ฑฐ์ ๊ฑด ์
๐ ๊ฑฐ์ ๋์ถ ๊ฑด ์ / ๊ณผ๊ฑฐ ๋์ถ ๊ฑด์ >> ๋น์จ ์ฐ์ถ
prev_amt_refused_agg['PREV_REFUSED_RATIO'] = prev_amt_refused_agg['PREV_REFUSED_COUNT'] / prev_amt_refused_agg['PREV_SK_ID_CURR_COUNT']
prev_amt_refused_agg.head(10)
[TIP โค๏ธ] Case When + groupby ๋จ๊ณ๋ฅผ ์ํํ๋ ๋ค๋ฅธ ๋ฐฉ๋ฒ (unstack ์ฌ์ฉ)
์) ๋์ถ ์ ์ฒญ ์ํ๊ฐ '์น์ธ', '๊ฑฐ์ '์ธ ๊ฒฝ์ฐ์ ๊ฐ ID ๋ณ๋ก ๋ช ๊ฑด์ธ์ง ํ์ธ
- ์๋๊ณผ ๊ฐ์ด isin()์ ๋์ถ์ํ๋ฅผ ์ง์ ํ๊ณ , groupby()์ ๋ ๊ฐ ์ปฌ๋ผ์ ๋ฃ์ด ๊ฐ ๊ฒฝ์ฐ์ ์๋ฅผ ์ ์ ์์
- groupby ํ ๋, ๋ง์ง๋ง์ unstack()์ ํด์ค์ผ ํจ
prev_refused_appr_group = prev[prev['NAME_CONTRACT_STATUS'].isin(['Approved', 'Refused'])].
groupby(['SK_ID_CURR', 'NAME_CONTRACT_STATUS'])
prev_refused_appr_agg = prev_refused_appr_group['SK_ID_CURR'].count().unstack()
# ๊ฒฐ์ธก์น ์ฑ์ฐ๊ณ ์ปฌ๋ผ๋ช
๋ณ๊ฒฝ
prev_refused_appr_agg = prev_refused_appr_agg.fillna(0)
prev_refused_appr_agg.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT']
# ๋ฆฌ์
์ธ๋ฑ์ค๋ก ์ ๋ฆฌ
prev_refused_appr_agg = prev_refused_appr_agg.reset_index()
prev_refused_appr_agg.head(3)
- prev_amt_agg์ ์กฐ์ธ ํ ๋ฐ์ดํฐ ๊ฐ๊ณต
prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on = 'SK_ID_CURR', how = 'left')
# SK_ID_CURR๋ณ ๊ณผ๊ฑฐ ๋์ถ๊ฑด์ ๋๋น APPROVED_COUNT ๋ฐ REFUSED_COUNT ๋น์จ ์์ฑ.
prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
# 'PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT' ์ปฌ๋ผ drop
prev_agg = prev_agg.drop(['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT'], axis=1)
# prev_amt_agg์ prev_refused_appr_agg INDEX์ธ SK_ID_CURR์ด ์กฐ์ธ ํ ์ ์ ์ปฌ๋ผ์ผ๋ก ์์ฑ๋จ.
prev_agg.head(3)
728x90