#데이터 불러오기
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/breast-cancer.csv')

# 타깃변수 확인
print(df['diagnosis'].value_counts())

# 범주형 종속변수를 이산형으로 변환
import numpy as np
df['diagnosis'] = np.where(df['diagnosis']=='M',1,0)

#'area_mean', 'area_worst' 변수를 설명변수로 설정하고
# 학습데이터 평가데이터  7:3 으로 분할
x = df[['area_mean','area_worst']]
y = df['diagnosis']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=1, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#BaggingClassifier을 사용해 분류기를 생성하여 예측을 수행
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(estimator = DecisionTreeClassifier(), oob_score=True)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(pred)

#혼동 행렬을 사용해 성능평가를 해보자
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
test_cm = confusion_matrix(y_test, pred)
test_acc = accuracy_score(y_test, pred)
test_pre = precision_score(y_test, pred)
test_recall = recall_score(y_test, pred)
test_f1 = f1_score(y_test, pred)

print(test_cm)
print(test_acc)
print(test_pre)
print(test_recall)
print(test_f1)

B    357
M    212
Name: diagnosis, dtype: int64
(398, 2) (171, 2) (398,) (171,)
[0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 1 1 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 1 1 0 0 1
 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 1
 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1
 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0]
[[103   4]
 [  9  55]]
0.9239766081871345
0.9322033898305084
0.859375
0.8943089430894309

C:\Users\82108\anaconda3\lib\site-packages\sklearn\ensemble\_bagging.py:789: UserWarning: Some inputs do not have OOB scores. This probably means too few estimators were used to compute any reliable oob estimates.
  warn(
C:\Users\82108\anaconda3\lib\site-packages\sklearn\ensemble\_bagging.py:795: RuntimeWarning: invalid value encountered in true_divide
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

y_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
auc_score = roc_auc_score(y_test, y_scores)

plt.plot(fpr, tpr, label='ROC Curve (AUC = {:.2f})'.format(auc_score))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.show()


#데이터 불러오기
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/CarPrice_Assignment.csv')
display(df.head())

def solution(x):
    for i in x:
        if i=='-':
            x=x.split('-')[0]
            break
        
        elif i==' ':
            x=x.split(' ')[0]
            break
    return x
df['CarName'] = df['CarName'].apply(solution)

#object타입 컬럼 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# object 타입의 컬럼에 대해 라벨 인코딩 수행
for column in df.select_dtypes(include='object'):
    df[column] = label_encoder.fit_transform(df[column])

display(df.head())


x=df.drop(columns=['car_ID','symboling','price'])
y=df['price']


#데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#데이터 모델링
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

reg = BaggingRegressor(base_estimator = DecisionTreeRegressor(), oob_score=True)
reg.fit(X_train, y_train)
pred = reg.predict(X_test)

print(reg.oob_score_)

#회귀모델이니 mse,mae,rmse 와같은 지표로 성능평가 해보자
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)

print(mse)
print(mae)
print(rmse)

(143, 23) (62, 23) (143,) (62,)
0.9209611135536796
4567203.398683872
1367.2279559139786
2137.1016350852087

C:\Users\82108\anaconda3\lib\site-packages\sklearn\ensemble\_base.py:166: FutureWarning: `base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.
  warnings.warn(
C:\Users\82108\anaconda3\lib\site-packages\sklearn\ensemble\_bagging.py:1253: UserWarning: Some inputs do not have OOB scores. This probably means too few estimators were used to compute any reliable oob estimates.
  warn(


#데이터 불러오기
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/breast-cancer.csv')
display(df)

# object 타입 컬럼 라벨인코딩 수행
from sklearn.preprocessing import LabelEncoder
#LabelEncoder 객체 생성
label_encoder = LabelEncoder()
#object 타입의 컬럼에 대해 라벨 인코딩 수행
for i in df.select_dtypes(include='object'):
    df[i] = label_encoder.fit_transform(df[i])

x=df[['area_mean','texture_mean']]    
y=df['diagnosis']

#데이터 분할 (학습데이터7, 평가데이터3)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=1, stratify = y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#AdaBoostClassifire로 모델을 생성후 데이터 학습
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# 기본 분류기로 DecisionTreeClassifier의 인스턴스를 생성
base_classifier = DecisionTreeClassifier()

# AdaBoostClassifier로 모델을 생성하고 데이터를 학습
clf = AdaBoostClassifier(base_estimator=base_classifier)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print('정확도: ', clf.score(X_test, y_test))

#분류분석이기에 혼동행렬을 통한 성능지표로 부스팅 모델의 예측성능 확인
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
test_cm = confusion_matrix(y_test, pred)
test_acc = accuracy_score(y_test, pred)
test_pre = precision_score(y_test, pred)
test_recall = recall_score(y_test, pred)
test_f1 = f1_score(y_test, pred)

print(test_cm)
print(test_acc)
print(test_pre)
print(test_recall)
print(test_f1)

(398, 2) (171, 2) (398,) (171,)
정확도:  0.8771929824561403
[[98  9]
 [12 52]]
0.8771929824561403
0.8524590163934426
0.8125
0.8319999999999999

C:\Users\82108\anaconda3\lib\site-packages\sklearn\ensemble\_base.py:166: FutureWarning: `base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.
  warnings.warn(


import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

y_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
auc_score = roc_auc_score(y_test, y_scores)

plt.plot(fpr, tpr, label='ROC Curve (AUC = {:.2f})'.format(auc_score))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.show()


importances = clf.feature_importances_
column_nm = pd.DataFrame(['area_mean','texture_mean'])
feature_importances = pd.concat([column_nm, pd.DataFrame(importances)], axis=1)
feature_importances.columns = ['feature_nm', 'importances']
feature_importances


#데이터 불러오기
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/CarPrice_Assignment.csv')
df

#자동차 회사명만 가져오기(ex. 아우디 볼보...)
def solution(x):
    for i in x:
        if i==' ':
            return x.split(' ')[0]
            
        elif i=='-':
            return x.split('-')[0]
df['CarName']  = df['CarName'].apply(solution)

#object타입 라벨 인코딩
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for i in df.select_dtypes(include='object'):
    df[i] = label_encoder.fit_transform(df[i])

x = df.drop(columns = ['car_ID','symboling','price'])
y = df['price']

#데이터 분할 (학습데이터7, 평가데이터3)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, y_trian.shape, y_test.shape)


#AdaBoostRegressor을 이용해 회귀분석을 수행!!!
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

Adareg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())
Adareg.fit(X_train, y_train)

#predict 메서드로 test데이터 예측후 회귀분석이므로
#mse. mae rmse 와같은 성능지표로 모델의 예측력을 확인
pred = Adareg.predict(X_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)
acc = reg.score(X_test, y_test)

print(mse)
print(mae)
print(rmse)
print(acc)

(143, 23) (62, 23) (143,) (62,)
5787641.991935484
1505.4193548387098
2405.7518558520296
0.924365988327351

C:\Users\82108\anaconda3\lib\site-packages\sklearn\ensemble\_base.py:166: FutureWarning: `base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.
  warnings.warn(


importances = Adareg.feature_importances_
column_nm = pd.DataFrame(x.columns)
feature_importances = pd.concat([column_nm, pd.DataFrame(importances)], axis=1)
feature_importances.columns = ['feature_nm','importances']
feature_importances = feature_importances.sort_values(by='importances', ascending=False)
feature_importances


#데이터 불러오기
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/breast-cancer.csv')
display('유방암 데이터', df.head())

#object 컬럼 라벨 인코딩
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for i in df.select_dtypes(include='object'):
    df[i] = label_encoder.fit_transform(df[i])
    
x = df[['area_mean', 'texture_mean']]
y = df['diagnosis']

#데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, stratify=y, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#RandomForestClassifier로 모델을 생성후, fit메서드로 데이터 학습
from sklearn.ensemble import RandomForestClassifier
randomcl = RandomForestClassifier(n_estimators=100, min_samples_split=5, oob_score=True)
randomcl.fit(X_train, y_train)

#predict메서드로 테스트 데이터 예측수행후 
#분류이므로 혼동행렬을 사용한 성능평가지표 확인
pred = randomcl.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
test_cm = confusion_matrix(y_test, pred)
test_acc = accuracy_score(y_test, pred)
test_pre = precision_score(y_test, pred)
test_reacll = recall_score(y_test, pred)
test_f1 = f1_score(y_test, pred)
#score메서드 : 분류 모델에서 정확도를 계산하는 메서드
test_score = randomcl.score(X_test, y_test)

print(test_cm)
print(test_acc)
print(test_pre)
print(test_reacll)
print(test_f1)
print(test_score)

'유방암 데이터'

(398, 2) (171, 2) (398,) (171,)
[[101   6]
 [ 13  51]]
0.8888888888888888
0.8947368421052632
0.796875
0.8429752066115702
0.8888888888888888


import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

y_scores = randomcl.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
auc_score = roc_auc_score(y_test, y_scores)

plt.plot(fpr, tpr, label='ROC Curve (AUC = {:.2f})'.format(auc_score))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.show()


importances = randomcl.feature_importances_
column_nm = pd.DataFrame(x.columns)
feature_importances = pd.concat([column_nm, pd.DataFrame(importances)], axis=1)
feature_importances.columns = ['feature_nm', 'importances']
feature_importances


#데이터 불러오기
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/CarPrice_Assignment.csv')
display('-----------자동차 가격 데이터-----------', df.head())

#자동차명 로고명만 가져오기(ex)아우디, 볼보...)
def solution(x):
    for i in x:
        if i == ' ':
            return x.split(' ')[0]
        elif i == '-':
            return x.split('-')[0]
df['CarName'] = df['CarName'].apply(solution)

#object타입 컬럼 라벨인코딩 수행
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

for i in df.select_dtypes(include='object'):
    df[i] = labelencoder.fit_transform(df[i])

x = df.drop(columns=['car_ID','symboling','price'])
y = df['price']

#데이터 분할(학습데이터7, 평가데이터3)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#RandomForestRegressor를 이용해 회귀분석을 수행. fit메서드로 데이터 학습
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(oob_score=True)
RFR.fit(X_train, y_train)

#predict 메서드로 테스트 데이터 예측수행후
#회귀이므로 mse,mae,rmse와 같은 회귀분석 성능지표로 예측력 확인
pred = RFR.predict(X_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)
score = RFR.score(X_test,y_test)

print(mse)
print(mae)
print(rmse)
print(score)

'-----------자동차 가격 데이터-----------'

(143, 23) (62, 23) (143,) (62,)
4208546.538701836
1344.5410967741936
2051.474235446752
0.9302881675477688


importances = RFR.feature_importances_
columns_nm = pd.DataFrame(x.columns)
feature_importances = pd.concat([columns_nm, pd.DataFrame(importances)], axis=1)
feature_importances.columns = ['feature_nm', 'importances']
feature_importances = feature_importances.sort_values(by='importances', ascending=False)
feature_importances

	car_ID	symboling	CarName	fueltype	aspiration	doornumber	carbody	drivewheel	enginelocation	wheelbase	...	enginesize	fuelsystem	boreratio	stroke	compressionratio	horsepower	peakrpm	citympg	highwaympg	price
0	1	3	alfa-romero giulia	gas	std	two	convertible	rwd	front	88.6	...	130	mpfi	3.47	2.68	9.0	111	5000	21	27	13495.0
1	2	3	alfa-romero stelvio	gas	std	two	convertible	rwd	front	88.6	...	130	mpfi	3.47	2.68	9.0	111	5000	21	27	16500.0
2	3	1	alfa-romero Quadrifoglio	gas	std	two	hatchback	rwd	front	94.5	...	152	mpfi	2.68	3.47	9.0	154	5000	19	26	16500.0
3	4	2	audi 100 ls	gas	std	four	sedan	fwd	front	99.8	...	109	mpfi	3.19	3.40	10.0	102	5500	24	30	13950.0
4	5	2	audi 100ls	gas	std	four	sedan	4wd	front	99.4	...	136	mpfi	3.19	3.40	8.0	115	5500	18	22	17450.0

	car_ID	symboling	CarName	fueltype	doornumber	carbody	drivewheel	wheelbase	...	enginesize	fuelsystem	boreratio	stroke	compressionratio	horsepower	peakrpm	citympg	highwaympg	price
0	1	3	1	1	1	0	2	88.6	...	130	5	3.47	2.68	9.0	111	5000	21	27	13495.0
1	2	3	1	1	1	0	2	88.6	...	130	5	3.47	2.68	9.0	111	5000	21	27	16500.0
2	3	1	1	1	1	2	2	94.5	...	152	5	2.68	3.47	9.0	154	5000	19	26	16500.0
3	4	2	2	1	0	3	1	99.8	...	109	5	3.19	3.40	10.0	102	5500	24	30	13950.0
4	5	2	2	1	0	3	0	99.4	...	136	5	3.19	3.40	8.0	115	5500	18	22	17450.0

	id	diagnosis	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	...	radius_worst	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst
0	842302	M	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.30010	0.14710	...	25.380	17.33	184.60	2019.0	0.16220	0.66560	0.7119	0.2654	0.4601	0.11890
1	842517	M	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.08690	0.07017	...	24.990	23.41	158.80	1956.0	0.12380	0.18660	0.2416	0.1860	0.2750	0.08902
2	84300903	M	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.19740	0.12790	...	23.570	25.53	152.50	1709.0	0.14440	0.42450	0.4504	0.2430	0.3613	0.08758
3	84348301	M	11.42	20.38	77.58	386.1	0.14250	0.28390	0.24140	0.10520	...	14.910	26.50	98.87	567.7	0.20980	0.86630	0.6869	0.2575	0.6638	0.17300
4	84358402	M	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.19800	0.10430	...	22.540	16.67	152.20	1575.0	0.13740	0.20500	0.4000	0.1625	0.2364	0.07678
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
564	926424	M	21.56	22.39	142.00	1479.0	0.11100	0.11590	0.24390	0.13890	...	25.450	26.40	166.10	2027.0	0.14100	0.21130	0.4107	0.2216	0.2060	0.07115
565	926682	M	20.13	28.25	131.20	1261.0	0.09780	0.10340	0.14400	0.09791	...	23.690	38.25	155.00	1731.0	0.11660	0.19220	0.3215	0.1628	0.2572	0.06637
566	926954	M	16.60	28.08	108.30	858.1	0.08455	0.10230	0.09251	0.05302	...	18.980	34.12	126.70	1124.0	0.11390	0.30940	0.3403	0.1418	0.2218	0.07820
567	927241	M	20.60	29.33	140.10	1265.0	0.11780	0.27700	0.35140	0.15200	...	25.740	39.42	184.60	1821.0	0.16500	0.86810	0.9387	0.2650	0.4087	0.12400
568	92751	B	7.76	24.54	47.92	181.0	0.05263	0.04362	0.00000	0.00000	...	9.456	30.37	59.16	268.6	0.08996	0.06444	0.0000	0.0000	0.2871	0.07039

	feature_nm	importances
14	enginesize	0.784734
11	curbweight	0.057258
22	highwaympg	0.048720
9	carwidth	0.022393
19	horsepower	0.013139
7	wheelbase	0.011686
15	fuelsystem	0.010524
20	peakrpm	0.007255
16	boreratio	0.006584
21	citympg	0.006427
0	CarName	0.005886
10	carheight	0.004461
8	carlength	0.004329
17	stroke	0.004106
18	compressionratio	0.003293
4	carbody	0.003131
13	cylindernumber	0.002893
12	enginetype	0.002050
5	drivewheel	0.000435
2	aspiration	0.000403
3	doornumber	0.000215
1	fueltype	0.000060
6	enginelocation	0.000020

	id	diagnosis	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	...	radius_worst	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst
0	842302	M	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	842517	M	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	84300903	M	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	84348301	M	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	84358402	M	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

	feature_nm	importances
14	enginesize	0.650386
11	curbweight	0.195286
22	highwaympg	0.058165
9	carwidth	0.019784
19	horsepower	0.015464
7	wheelbase	0.011158
0	CarName	0.007592
21	citympg	0.006522
20	peakrpm	0.006101
8	carlength	0.005272
15	fuelsystem	0.004809
18	compressionratio	0.003833
10	carheight	0.003314
16	boreratio	0.002829
17	stroke	0.002414
12	enginetype	0.001963
4	carbody	0.001872
13	cylindernumber	0.001062
3	doornumber	0.000773
2	aspiration	0.000675
5	drivewheel	0.000464
1	fueltype	0.000228
6	enginelocation	0.000035

빅분기6장(앙상블(Ensemble))

6.8 앙상블(Ensemble)¶

앙상블은 단일 결정 트리의 단점 극복을 위해 여러 머신러닝 모델을 연결해 더 강력한 모델을 만드는 방법¶

편향, 잡음 및 분산으로 인한 오류를 막고 과적합을 방지하기 위해 사용 대표적인 앙상블 기법으로 붓스트랩, 배깅, 부스팅, 랜덤포레스트가 있다.¶

1. 부트스트랩¶

랜덤 샘플링의 일종, 단순임의 복원추출법을 적용해 여려 개의 동일한 크기의 표본 자료를 획득하는 방법.¶

주어진 데이터를 원래의 모집단을 대표하는 독립 표본으로 가정하고, 그 자료로부터 중복을 허용한 무작위 재추출을 하여 복수의 자료를 획득하고 각각에서 통계량을 계산함¶

2. 배깅¶

주어진 자료서 여러 개의 부트스트랩 자료를 생성후 각각의 부트스트랩 자료에 예측 모형을 만든후 결합하여 최종 예측모형을 만드는 방식.¶

분류,회귀분석 에서 사용하는 머신러닝 알고리즘의 안정성과 정확도를 향상시키기 위해 고안된 일종의 앙상블 학습법의 알고리즘이다.¶

분산을 줄이고 정확도를 개선하여 모델의 안정성을 크게 높여 과적합을 방지한다.¶

배깅을 이용한 머신러닝은 "부트스트랩 >> 모델링 >> 보팅" 순서로 진행¶

1) 보팅(Voting) : 여러 개의 모형으로 부터 산출된 결과 중 다수결에 의해 최종 결과를 산정하는 과정(의사결정 트리 구축의 가장 어려운 가지치기를 진행하지 않고 약한 학습자인 트리를 최대로 성장시킨 후 보팅을 진행)¶

2) 배깅 특징 : 주어진 자료보다 분산이 적은 앙상블 모델을 얻는 데 중점, 각 부트스트랩에 대해 부트스트래핑 및 모델링 과정이 병렬적으로 수행¶

3) Out Of Bag : 배깅에서 부트스트래핑 수행시 평균적으로 각 예측기에 훈련데이터의 63% 정도만 샘플링됨, 이떄 선택되지 않은 나머지 37%를 Out Of Bag 샘플이라고함. 단, 예측기마다 남겨진 37%의 데이터는 서로 다르다.¶

4) oob_score : 선택되지 않은 37%의 데이터롤 모델의 성능평가를 수행하는 데 사용할 수 있다. 따라서 검증세트나 교차검증이 따로 필요하지 않다.¶

3. 코드실습¶

위스콘신 유방암 진단 데이터세트를 사용해 종양을 예측하는 분류분석을 수행해보자¶

BaggingClassifire 함수에서 oob_score 매개변수를 True로 설정하면 이기능을 사용할수 있음, oob_score를 사용할 경우 검증세트나 교차검증을 하지 않아도 된다는 장¶

ROC곡선을 그리고 AUC값을 계산해 분류 예측 결과를 시각화¶

배깅 방식으로 회귀분석을 구현하는 BaggingRegressor 함수에 대해 알아보자¶

자동차 데이터세트를 사용해 자동차의 가격을 예측해보자¶

부스팅¶

부스팅은 예측력이 약한 모형들을 결합해 강한 예측모형을 만드는 방법¶

부트스트랩을 병렬로 수행하여 각 모델을 독립적으로 구축하는 배깅과 달리, 부스팅은 순차방식으로 학습!!!¶

따라서 배깅에 비해 모델의 장점을 최적화하고 train데이터에 대해 오류가 적은 결합모델을 생성할 수 있다는 장점이 있다, 그러나 train 데이터에 과적합할 위험이 있다.¶

아다부스트(AdaBoost): 각각의 학습 단계에서 이전 모델이 잘못 분류한 샘플에 가중치를 부여하여 다음 모델을 학습시킵니다. 예측이 어려운 샘플에 집중하여 학습하는 특징이 있습니다.¶

그래디언트 부스팅(Gradient Boosting): 손실 함수의 그래디언트를 활용하여 모델을 학습시킵니다. 이전 모델의 오차(residual)를 예측하는 새로운 모델을 학습시키는 방식입니다. 이 과정을 반복하여 모델을 개선해나갑니다.¶

코드실습¶

배깅에서 실습한 유방암 데이터로 부스팅을 수행¶

ROC 커브를 시각화해 auc를 구해볼수 있다.¶

변수중요도를 통해 예측에 사용된 변수들 중 타깃변수에 영향을 가장 많이 준 변수가 무엇인지 확인!!!!¶

부스트 방식으로 회귀분석을 구현하는 AdaBoostRegressor 함수에 대해 알아보자¶

배깅에서 사용한 자동차 가격 데이터로 부스팅 수행¶

변수중요도롤 통해 예측에 사용된 변수들 중 타겟변수에 영향을 가장 많이 준 변수가 무엇인지 확인해보자¶

랜덤포레스트¶

배깅과 부스팅보다 더 많은 무작위성을 주어 약한 학습기들을 생성후 이를 선형결합해 최종 학습기를 만드는 방법!!!!¶

코드 실습¶

랜덤포레스트 방식으로 분류분석을 구현하는 RandomForestClassifier 함수에 대해 알아보자¶

유방암 데이터로 실습해보자¶

ROC커브를 시각화해 AUC를 구해볼 수 있다¶

이번에는 변수중요도를 통해 예측에 사용된 변수들 중 타깃변수에 영향을 가장 많이 준 변수 확인¶

랜더포레스트 방식으로 회귀분석 구현하는 RandomForestRegressor 함수에 대해 알아보자¶

자동차 가격 데이터로 코드실습 수행해보자!!!¶

마찬가지로 변수중요도를 통해 예측에 사용된 변수들 중 타깃변수에 영향을 가장 많이 준 변수가 무엇인지 살펴보자¶