데이터 설명: Previous_Application
- 이 캐글 대회는 대출 정보에 따라 연체 여부를 예측할 수 있는 메인 train 데이터 외에, 고객 별로 이전 대출 이력 현황 데이터도 제공하고 있다.
- 이전 대출이력 데이터를 가공하여, 메인 데이터 셋과 결합하여 예측 결과가 나아지는지 확인해보자.
- 자세한 데이터셋과 컬럼 설명은 캐글에서 확인하세요!
데이터 로딩
# Read the previous-application history, then print its (rows, columns) shape
# next to the shape of the main `apps` dataset.
prev = pd.read_csv(filepath_or_buffer='previous_application.csv')
print(*(frame.shape for frame in (prev, apps)))
๋ฉ์ธ ๋ฐ์ดํฐ์ ๊ณผ ์กฐ์ธํ์ฌ key(ID) ๊ฐ ๊ธฐ์ค์ผ๋ก ์ฒดํฌ
- ๋ฉ์ธ ๋ฐ์ดํฐ์ ์ธ apps ๋ฐ์ดํฐ์ ์กฐ์ธ
- ์ด๋, ํค ๊ฐ์ธ SK_ID_CURR ๊ธฐ์ค์ผ๋ก merge ํ๋, indicator๋ฅผ ์ค์ ํ์ฌ ๋ ๋ฐ์ดํฐ์ id ์ฐจ์ด๊ฐ ์ด๋ป๊ฒ ๋๋์ง ํ์ธํ๋ค.
# Outer-join prev with the SK_ID_CURR keys of apps. indicator=True adds a
# `_merge` column recording whether each key was found on the left side only,
# the right side only, or both, so missing ids on either side become visible.
prev_app_outer = prev.merge(
    right=apps['SK_ID_CURR'],
    how='outer',
    on='SK_ID_CURR',
    indicator=True,
)
# Row counts per join outcome.
prev_app_outer['_merge'].value_counts()
주요 Feature EDA
▶️ 숫자형 피쳐들의 분포 확인(TARGET 값에 따라)
# Merge the TARGET label (keyed by SK_ID_CURR) from the main training set onto prev.
# how='left' keeps every row of prev; rows whose SK_ID_CURR is absent from
# app_train end up with a missing TARGET.
app_prev = prev.merge(app_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')
# Visualise the distribution of continuous variables
def show_hist_by_target(df, columns):
    """Plot the distribution of each given column, split by TARGET.

    For every name in ``columns`` a new 1x2 figure is created: the left axes
    holds a violin plot of the column grouped by ``TARGET``; the right axes
    overlays the distribution of the ``TARGET == 0`` rows (blue) and the
    ``TARGET == 1`` rows (red).

    Args:
        df: Frame holding a ``TARGET`` column and every column in ``columns``.
        columns: Iterable of column names to plot. Nothing is drawn when empty.

    Returns:
        None. The figures are created as a side effect.
    """
    # Rows whose TARGET is neither 0 nor 1 (e.g. missing) match neither mask
    # and are therefore left out of the right-hand plot.
    cond_1 = (df['TARGET'] == 1)
    cond_0 = (df['TARGET'] == 0)

    for column in columns:
        fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), squeeze=False)
        sns.violinplot(x='TARGET', y=column, data=df, ax=axs[0][0])
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # confirm the installed version before migrating to histplot/kdeplot.
        sns.distplot(df[cond_0][column], ax=axs[0][1], label='0', color='blue')
        sns.distplot(df[cond_1][column], ax=axs[0][1], label='1', color='red')
        # The label= values above are only displayed once a legend is drawn.
        axs[0][1].legend()
# Keep only the numeric columns. select_dtypes(include='number') selects the
# integer and float columns explicitly instead of treating every non-object
# dtype (category, datetime, string, ...) as numeric.
num_columns = app_prev.select_dtypes(include='number').columns.tolist()
# Exclude the columns that should not be plotted (ids and the target).
num_columns = [column for column in num_columns if column not in ['SK_ID_CURR', 'SK_ID_PREV', 'TARGET']]
show_hist_by_target(app_prev, num_columns)
▶️ 명목형 피쳐들의 분포 확인(TARGET 값에 따라)
# Names of the columns stored with the 'object' dtype (the categorical features).
object_columns = [
    name for name, dtype in app_prev.dtypes.items() if dtype == 'object'
]
# Visualise with catplot
def show_category_by_target(df, columns):
    """Draw a count plot of each given column, faceted by TARGET.

    For every name in ``columns`` a seaborn ``catplot`` of kind ``"count"`` is
    created with one facet column per ``TARGET`` value, and the x tick labels
    are rotated by 65 degrees.

    Args:
        df: Frame holding a ``TARGET`` column and every column in ``columns``.
        columns: Iterable of column names to plot. Nothing is drawn when empty.

    Returns:
        None. The figures are created as a side effect.
    """
    for column in columns:
        chart = sns.catplot(x=column, col="TARGET", data=df, kind="count")
        chart.set_xticklabels(rotation=65)
# Plot every object-dtype column of app_prev, one facet per TARGET value.
show_category_by_target(df=app_prev, columns=object_columns)
▶️ 파생변수 생성
- 중요한 변수들 간의 관계를 파악하여, 의미있을 것이라 기대하는 변수를 생성
# Differences and ratios between the amount applied for (AMT_APPLICATION) and
# the credit amount, annuity and goods price of each previous application.
prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
# Mask zero application amounts to NaN before dividing: x / 0 would otherwise
# produce +/-inf, and a single inf turns a per-customer mean/max/sum into inf.
# Missing amounts stay missing (NaN != 0 is True, so they pass through).
_appl_amount = prev['AMT_APPLICATION'].where(prev['AMT_APPLICATION'] != 0)
prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT'] / _appl_amount
prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY'] / _appl_amount
prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE'] / _appl_amount
- 분포 확인 시, 이상치가 있었을 경우 다음과 같이 이상치를 치환해준 후, 변수 생성
# Replace the outlier value 365243 in DAYS_LAST_DUE_1ST_VERSION with NaN.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column acts on an intermediate object, which pandas warns
# about and which leaves `prev` unchanged when Copy-on-Write is enabled.
prev['DAYS_LAST_DUE_1ST_VERSION'] = prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan)
# Period between the first due date and the last due date.
# NOTE(review): DAYS_LAST_DUE is used as-is; if it can also hold 365243 the
# difference will be distorted for those rows -- confirm against the data.
prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
- null 값이 많지만, 중요한 변수인 경우는 새롭게 생성(예: 이자율)
# Monthly payment * number of payments => total amount repaid.
all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
# Mask zero denominators to NaN before dividing: a zero credit amount or a zero
# payment count would otherwise produce +/-inf, which then dominates the
# per-customer mean/max of this feature.
_credit_amount = prev['AMT_CREDIT'].where(prev['AMT_CREDIT'] != 0)
_payment_count = prev['CNT_PAYMENT'].where(prev['CNT_PAYMENT'] != 0)
# Interest rate = (total repaid / credit amount - 1) / number of payments.
prev['PREV_INTERESTS_RATE'] = (all_pay / _credit_amount - 1) / _payment_count
▶️ 기존 피쳐와 생성한 파생변수들을 기준으로 aggregation
- 이렇게 한 컬럼에 다양한 집계함수를 적용하는 이유는, 일단 어떤 컬럼이 유용할지 아직은 모르기 때문
# Aggregate the original amounts and the newly derived differences/ratios per
# customer. It is not yet known which statistics matter, so several are listed
# for each column; they can be checked and pruned later.
agg_dict = {
    # Original columns.
    "SK_ID_CURR": ["count"],
    "AMT_CREDIT": ["mean", "max", "sum"],
    "AMT_ANNUITY": ["mean", "max", "sum"],
    "AMT_APPLICATION": ["mean", "max", "sum"],
    "AMT_DOWN_PAYMENT": ["mean", "max", "sum"],
    "AMT_GOODS_PRICE": ["mean", "max", "sum"],
    "RATE_DOWN_PAYMENT": ["min", "max", "mean"],
    "DAYS_DECISION": ["min", "max", "mean"],
    "CNT_PAYMENT": ["mean", "sum"],
    # Engineered columns.
    "PREV_CREDIT_DIFF": ["mean", "max", "sum"],
    "PREV_CREDIT_APPL_RATIO": ["mean", "max"],
    "PREV_GOODS_DIFF": ["mean", "max", "sum"],
    "PREV_GOODS_APPL_RATIO": ["mean", "max"],
    "PREV_DAYS_LAST_DUE_DIFF": ["mean", "max", "sum"],
    "PREV_INTERESTS_RATE": ["mean", "max"],
}

# One output row per SK_ID_CURR; the result has (column, statistic) column pairs.
prev_group = prev.groupby("SK_ID_CURR")
prev_amt_agg = prev_group.agg(agg_dict)

# Flatten each (column, statistic) pair into a single upper-case name with a
# 'PREV_' prefix, e.g. ('AMT_CREDIT', 'mean') -> 'PREV_AMT_CREDIT_MEAN'.
# Engineered columns already start with 'PREV_', so they get a doubled prefix.
prev_amt_agg.columns = [
    'PREV_' + (source_column + '_' + statistic).upper()
    for source_column, statistic in prev_amt_agg.columns
]
728x90