1. ์ฐ์ํ ํผ์ณ์ ๋ถํฌ ์๊ฐํ(Target ๊ฐ์ ๋ฐ๋ผ)
- ํจ์๋ฅผ ์ด์ฉํ์ฌ, ์ปฌ๋ผ๋ณ๋ก target๊ฐ์ด 0, 1์ผ ๋ ์๊ฐํํ๊ธฐ
- violinplot, distplot ์ฌ์ฉ
- ๊ฒฐ๊ณผ๋ฅผ ๋ณด๋ฉฐ, target ์ ๋ฐ๋ผ ์ ์๋ฏธํ ์ฐจ์ด๊ฐ ๋๋ ํผ์ณ ํ์ธ >> ํผ์ณ ์ค์๋ ํ์
๐ ์ฐ๋ น๋๊ฐ ๋ฎ์(๋๋ ์ง์ฅ ๊ฒฝ๋ ฅ์ด ์ ์), ์์ก ๋์ถ ๊ฑด์์ ์ฐ์ฒด ๋น์ค์ด ๋์ ๋ณด์
def show_hist_by_target(df, columns):
    """Plot each column's distribution split by the binary ``TARGET`` label.

    For every name in ``columns`` a new figure with two side-by-side axes is
    created: a violin plot of the column grouped by ``TARGET`` on the left,
    and overlaid ``distplot`` curves for the ``TARGET == 1`` rows (red) and
    the ``TARGET == 0`` rows (blue) on the right.

    Args:
        df: DataFrame that contains a ``TARGET`` column and every column
            named in ``columns``.
        columns: Iterable of column names to plot; presumably numeric
            (continuous) features -- confirm against the caller.

    Returns:
        None. The figures are created as a side effect.
    """
    # Row masks for each target value.
    cond_1 = (df['TARGET'] == 1)
    cond_0 = (df['TARGET'] == 0)

    # Draw one figure per column.
    for column in columns:
        print('column name: ', column)  # progress / sanity-check output
        _, axs = plt.subplots(figsize=(12, 4), nrows=1, ncols=2, squeeze=False)
        dist_ax = axs[0][1]

        sns.violinplot(x='TARGET', y=column, data=df, ax=axs[0][0])

        # distplot expects a 1-D Series, so select the single column per mask.
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # consider histplot/kdeplot when the seaborn version is upgraded.
        sns.distplot(df.loc[cond_1, column], label='1', color='red', ax=dist_ax)
        sns.distplot(df.loc[cond_0, column], label='0', color='blue', ax=dist_ax)

        # The labels passed above are only rendered once a legend is requested.
        dist_ax.legend()
# Plot the TARGET-split distributions for app_train.
# `columns` is expected to be defined earlier in the notebook.
show_hist_by_target(df=app_train, columns=columns)
2. ๋ช ๋ชฉํ ํผ์ณ์ ๋ถํฌ ์๊ฐํ(Target ๊ฐ์ ๋ฐ๋ผ)
- countplot์ ์ด์ฉํ์ฌ, ๋ช ๋ชฉํ ํผ์ณ์ ํ์คํ ๊ทธ๋จ์ ํํ
# Collect the names of the object-dtype columns into a list.
object_columns = [
    name for name, dtype in app_train.dtypes.items() if dtype == 'object'
]
# Plotting helper
def show_count_by_target(df, columns):
    """Draw category counts of each column, one panel per ``TARGET`` value.

    For every name in ``columns`` a new figure with two side-by-side axes is
    created: counts for the ``TARGET == 0`` rows on the left and for the
    ``TARGET == 1`` rows on the right.

    Args:
        df: DataFrame that contains a ``TARGET`` column and every column
            named in ``columns``.
        columns: Iterable of column names to plot; presumably categorical
            (object-dtype) features -- confirm against the caller.

    Returns:
        None. The figures are created as a side effect.
    """
    cond_1 = (df['TARGET'] == 1)
    cond_0 = (df['TARGET'] == 0)

    for column in columns:
        _, axs = plt.subplots(nrows=1, ncols=2, figsize=(18, 4), squeeze=False)

        # Left panel: TARGET == 0, right panel: TARGET == 1.
        for mask, ax in ((cond_0, axs[0][0]), (cond_1, axs[0][1])):
            # Pass the Series explicitly as ``x=``. Newer seaborn releases make
            # every argument after ``data`` keyword-only, so a positional
            # Series would be interpreted as ``data`` instead of ``x``.
            chart = sns.countplot(x=df.loc[mask, column], ax=ax)
            # There can be many category labels on the x axis, so rotate the
            # tick labels by 45 degrees to keep them readable.
            chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
# Plot per-TARGET category counts for every object-dtype column of app_train.
show_count_by_target(df=app_train, columns=object_columns)
- catplot()을 이용하면 다음과 같이 target 값에 따라 피쳐 값의 분포를 동일한 y 선상에서 비교 가능
# Note: TARGET is passed as `col` (one panel per value) rather than as `y`.
sns.catplot(data=app_train, x='CODE_GENDER', col='TARGET', kind='count')
# Plotting helper definition
def show_category_by_target(df, columns):
    """Draw a count ``catplot`` per column, faceted by ``TARGET``.

    For each name in ``columns`` the column name is printed, a count plot
    with one panel per ``TARGET`` value is drawn, and its x tick labels are
    rotated by 65 degrees.

    Args:
        df: DataFrame that contains a ``TARGET`` column and every column
            named in ``columns``.
        columns: Iterable of column names to plot.

    Returns:
        None. The figures are created as a side effect.
    """
    for name in columns:
        print('col name: ', name)
        grid = sns.catplot(data=df, kind='count', x=name, col='TARGET')
        grid.set_xticklabels(rotation=65)
# Draw the TARGET-faceted count plots for every object-dtype column of app_train.
show_category_by_target(df=app_train, columns=object_columns)
- 결과를 확인하여, 일부 값을 데이터프레임으로 확인
👉 예를 들어, 바로 위 이미지를 보면 대출 횟수 대비 연체 비율이 남성이 여성 보다 높아 보이므로 이를 확인
# Row masks for each TARGET value.
cond_1 = app_train['TARGET'].eq(1)
cond_0 = app_train['TARGET'].eq(0)

# Share of each CODE_GENDER value relative to the row count of: all rows,
# the TARGET == 1 rows, and the TARGET == 0 rows (printed in that order).
for frame in (app_train, app_train[cond_1], app_train[cond_0]):
    print(frame['CODE_GENDER'].value_counts() / len(frame))
3. Target 과 주요 컬럼의 상관관계 분석
- 주요 컬럼을 추출하고, corr()으로 상관계수를 도출
- 히트맵을 통해 확인
# Columns (including the TARGET label) whose pairwise correlations are inspected.
corr_columns = [
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'AMT_GOODS_PRICE',
    'DAYS_EMPLOYED',
    'DAYS_ID_PUBLISH',
    'DAYS_REGISTRATION',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_INCOME_TOTAL',
    'TARGET',
]

# Correlation matrix of the selected columns.
selected = app_train[corr_columns]
col_corr = selected.corr()
col_corr  # bare expression: shown as cell output when run in a notebook

# Annotated heatmap of the correlation matrix on a 9x9 figure.
plt.figure(figsize=(9, 9))
sns.heatmap(col_corr, annot=True)
4. 이상치 확인 및 처리
- 위의 컬럼별 분포 시각화 결과를 보며, 비상식적인 이상치(예를 들면, 근속기간 3만년 ..)를 처리
# Inspect the raw values directly to spot outliers.
## 365243 occurs very often; as a day count it corresponds to roughly 1000 years.
app_train['DAYS_EMPLOYED'].value_counts()
## CODE_GENDER has only about 4 'XNA' rows; that is few, so they are kept as-is.
app_train['CODE_GENDER'].value_counts()
# Replace the implausible 365243 value with NaN.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace(365243, np.nan)
# Check the result (dropna=False so the new NaN count is listed too).
app_train['DAYS_EMPLOYED'].value_counts(dropna=False)
728x90