Decision Tree Practice - Classifying the Human Activity Recognition Dataset
ISLA!
2023. 8. 21. 19:21
🧑🏻‍💻 Example Overview
- Activity data was collected from 30 people wearing smartphone sensors
- A decision tree is used to predict which activity each record represents
Loading the data: the features.txt file
The file contains feature indexes and feature names, so load it into a DataFrame to check the feature names
import pandas as pd

DATA_PATH = '/content/drive/MyDrive/data'
feature_name_df = pd.read_csv(DATA_PATH + '/human_activity/features.txt', sep='\s+',
                              header=None, names=['column_index', 'column_name'])
feature_name_df.head(1)
Dropping the feature index and storing just the feature names
- Feature names: means/standard deviations of attributes related to body movement, recorded as x, y, z axis values
- However, some feature names are duplicated, and this needs to be handled
feature_name = feature_name_df.iloc[:, 1].values.tolist()
feature_name[:10]
Checking the duplicated feature names
- The code below shows that as many as 42 feature names are duplicated
- Inspecting just the first 5 duplicated names gives the output below
- To handle them, we write a function that renames each duplicate by appending _1, _2, ... to the original feature name
feature_dup_df = feature_name_df.groupby('column_name').count()
print(feature_dup_df[feature_dup_df['column_index'] > 1].count())
feature_dup_df[feature_dup_df['column_index'] > 1].head()
Function for handling duplicated feature names
- Define a function that takes the original feature-name DataFrame and deduplicates the names
- feature_dup_df = group the original DataFrame by column_name and take the cumulative count of each name >> stored in a dup_cnt column
- Reset the index of feature_dup_df
- new_feature_name_df = merge the original DataFrame with feature_dup_df (outer join)
- new_feature_name_df['column_name'] = take just the column_name and dup_cnt columns and apply a lambda
- If dup_cnt is greater than 0 (i.e., the name is a duplicate), join column_name and dup_cnt with '_'
- If it is not greater than 0 (not a duplicate), return column_name unchanged
- Drop the index column from new_feature_name_df
- Return new_feature_name_df
def get_new_feature_name_df(old_feature_name_df):
    # Cumulative count per column_name: 0 for the first occurrence, then 1, 2, ... for duplicates
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(),
                                  columns=['dup_cnt'])
    feature_dup_df = feature_dup_df.reset_index()
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    # Append _1, _2, ... to duplicated feature names
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(
        lambda x: x[0] + '_' + str(x[1]) if x[1] > 0 else x[0], axis=1)
    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)
    return new_feature_name_df
function_test = get_new_feature_name_df(feature_name_df)
function_test.sample(5)
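As a quick sanity check (not shown in the original post), pandas' duplicated() can confirm that the renamed feature names are now unique:

# Illustrative verification: after renaming, no column_name should repeat
print(function_test['column_name'].duplicated().sum())  # expected: 0 for this dataset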
Processing the train and test data: a helper function
- Load the train and test data
- Apply the duplicate-name handling function defined above
- Extract the column names as a list
- Build the training and test datasets
import pandas as pd

def get_human_dataset():
    # Each data file is whitespace-separated, so pass the whitespace pattern as sep to read_csv
    DATA_PATH = '/content/drive/MyDrive/data'
    feature_name_df = pd.read_csv(DATA_PATH + '/human_activity/features.txt', sep='\s+',
                                  header=None, names=['column_index', 'column_name'])

    # Create a new feature-name DataFrame with get_new_feature_name_df(), which fixes the duplicated names
    new_feature_name_df = get_new_feature_name_df(feature_name_df)

    # Convert the feature names back into a list object so they can be assigned as DataFrame column names
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()

    # Load the training and test feature datasets as DataFrames, applying feature_name as the column names
    X_train = pd.read_csv(DATA_PATH + '/human_activity/train/X_train.txt', sep='\s+', names=feature_name)
    X_test = pd.read_csv(DATA_PATH + '/human_activity/test/X_test.txt', sep='\s+', names=feature_name)

    # Load the training and test label data as DataFrames, naming the column 'action'
    y_train = pd.read_csv(DATA_PATH + '/human_activity/train/y_train.txt', sep='\s+', header=None, names=['action'])
    y_test = pd.read_csv(DATA_PATH + '/human_activity/test/y_test.txt', sep='\s+', header=None, names=['action'])

    # Return all of the loaded train/test DataFrames
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = get_human_dataset()
Checking the train dataset
- It has about 7,000 records and 561 features (columns)
- All features are floating-point numbers, so no separate categorical encoding is needed
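A minimal way to verify this (the post states only the conclusion; DataFrame.info() is one option):

# Row count, column count, and dtypes of the training feature set
X_train.info()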
Checking the label (target) data
- There are six values, 1 through 6, and they are fairly evenly distributed
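One quick way to see that distribution (a sketch; any equivalent count works):

# Count how many training records fall into each of the six action labels
print(y_train['action'].value_counts())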
Classifying the activity with DecisionTreeClassifier
- First, predict with the hyperparameters left at their defaults, then print the resulting accuracy and the parameters
- The accuracy is about 85.48%
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Set random_state so repeated runs of the example produce identical predictions
dt_clf = DecisionTreeClassifier(random_state=156)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('Decision tree prediction accuracy: {0:.4f}'.format(accuracy))

# Extract the DecisionTreeClassifier hyperparameters
print('DecisionTreeClassifier default hyperparameters:\n', dt_clf.get_params())
Result
Decision tree prediction accuracy: 0.8548
DecisionTreeClassifier default hyperparameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 156, 'splitter': 'best'}
Finding the optimal parameters with GridSearchCV
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [6, 16, 24],
    'min_samples_split': [16]
}

grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5, verbose=1)
grid_cv.fit(X_train, y_train)
Checking the results
- The best score comes out at 84.86%; note that best_score_ is the mean cross-validated accuracy over the 5 folds of the training data, so it is not directly comparable to the 85.48% test-set accuracy above
print('Best cross-validation accuracy:', grid_cv.best_score_)
print('Best parameters:', grid_cv.best_params_)
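As a natural follow-up (a sketch, not part of the original post), the tuned tree can be retrieved via best_estimator_ and scored on the held-out test set:

# Evaluate the best estimator found by GridSearchCV on the test set
best_dt_clf = grid_cv.best_estimator_
pred = best_dt_clf.predict(X_test)
print('Tuned decision tree test accuracy: {0:.4f}'.format(accuracy_score(y_test, pred)))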