Big Data Analysis Engineer Exam Prep
Big Data Analysis Engineer, Chapter 6 (Support Vector Machines)
세용용용용
2023. 5. 12. 20:24
In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
c = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/classification.csv')
c
Out[43]:
|  | age | interest | success |
|---|---|---|---|
| 0 | 23.657801 | 18.859917 | 0.0 |
| 1 | 22.573729 | 17.969223 | 0.0 |
| 2 | 32.553424 | 29.463651 | 0.0 |
| 3 | 6.718035 | 25.704665 | 1.0 |
| 4 | 14.401919 | 16.770856 | 0.0 |
| ... | ... | ... | ... |
| 292 | 27.697220 | 18.799309 | 0.0 |
| 293 | 15.150959 | 72.000352 | 1.0 |
| 294 | 22.264378 | 68.453459 | 1.0 |
| 295 | 25.677420 | 90.118212 | 1.0 |
| 296 | 21.215594 | 48.265520 | 1.0 |
297 rows × 3 columns
Checking the class distribution with a scatterplot
In [44]:
sns.pairplot(hue='success', data=c)
Out[44]:
<seaborn.axisgrid.PairGrid at 0x1adc3d96400>
Splitting the data (train : test = 7 : 3)
In [45]:
from sklearn.model_selection import train_test_split
x=c[['age','interest']]
y=c['success']
train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.3, random_state=1, stratify=y)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
(207, 2) (90, 2) (207,) (90,)
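Because stratify=y was passed, both splits keep the original class ratio. A quick check (a sketch added here, not in the original notebook):

# Sketch: stratify=y preserves the 0/1 class ratio in train and test sets
print(y.value_counts(normalize=True).round(3))
print(train_y.value_counts(normalize=True).round(3))
print(test_y.value_counts(normalize=True).round(3))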
In [46]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
train_x = std_scaler.fit_transform(train_x)
test_x = std_scaler.transform(test_x)
train_df = pd.concat([pd.DataFrame(train_x), train_y.reset_index(drop=True)], axis=1)
sns.pairplot(data=train_df, hue='success')
Out[46]:
<seaborn.axisgrid.PairGrid at 0x1adc3fa96a0>
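SVMs are distance-based, so an unscaled feature with a large range would dominate the margin. A minimal check that the scaler behaved as expected (a sketch, not in the original):

# After StandardScaler, training features should have mean ~0 and std ~1
print(train_x.mean(axis=0).round(3))
print(train_x.std(axis=0).round(3))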
In [47]:
from sklearn.svm import SVC
# The C parameter controls the regularization strength of the SVC model.
# Smaller values mean stronger regularization,
# larger values mean weaker regularization. Here C is set to 0.5.
clf = SVC(C=0.5, random_state=45)
clf.fit(train_x, train_y)
Out[47]:
SVC(C=0.5, random_state=45)
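To see the regularization effect of C directly, here is a minimal sketch (assuming the scaled train_x/train_y from above, with illustrative C values) that refits the model and counts support vectors:

# Sketch: smaller C -> stronger regularization -> wider margin,
# which typically leaves more points inside the margin as support vectors
for c_val in [0.01, 0.5, 10]:
    m = SVC(C=c_val, random_state=45).fit(train_x, train_y)
    print(f'C={c_val}: support vectors per class = {m.n_support_}')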
In [52]:
pred = clf.predict(test_x)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
test_cm = confusion_matrix(test_y, pred)
test_acc = accuracy_score(test_y, pred)
test_prc = precision_score(test_y, pred)
test_rcll = recall_score(test_y, pred)
test_f1 = f1_score(test_y, pred)
print(test_cm)
print('Accuracy\t{}%'.format(round(test_acc*100,2)))
print('Precision\t{}%'.format(round(test_prc*100,2)))
print('Recall\t{}%'.format(round(test_rcll*100,2)))
print('F1 score\t{}%'.format(round(test_f1*100,2)))
[[37  2]
 [ 2 49]]
Accuracy	95.56%
Precision	96.08%
Recall	96.08%
F1 score	96.08%
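Here C=0.5 was picked by hand; in practice C (and gamma, for the RBF kernel) are usually tuned by cross-validation. A hedged sketch using GridSearchCV on the training set (the grid values below are illustrative assumptions, not from the original):

# Sketch: 5-fold cross-validated grid search over C and gamma
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 0.5, 1, 10], 'gamma': ['scale', 0.1, 1]}
grid = GridSearchCV(SVC(random_state=45), param_grid, cv=5, scoring='f1')
grid.fit(train_x, train_y)
print(grid.best_params_, round(grid.best_score_, 4))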
Support Vector Regression (SVR): regression built on the support vector approach
In [65]:
import pandas as pd
import numpy as np
# Generate sample data
x = np.sort(5*np.random.rand(40,1), axis=0)
y = np.sin(x).ravel()
# Add noise to the target data
y[::5] += 3*(0.5 - np.random.rand(8))
print(x[:10])
print(y[:10])
[[0.3766309 ]
 [0.43706113]
 [0.45401344]
 [0.59369193]
 [0.61060848]
 [0.71536235]
 [1.11122054]
 [1.18305125]
 [1.38967204]
 [1.46918605]]
[-0.21138522  0.42327868  0.43857592  0.559425    0.57336609  1.45090693
  0.89624075  0.925764    0.98364179  0.99484212]
Build SVR models with the kernel function set to RBF (radial basis function), linear, and polynomial, respectively
In [66]:
from sklearn.svm import SVR
# Fit the regression models
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr_linear = SVR(kernel='linear', C=100, gamma='auto')
svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=0.1, coef0=1)
svr_rbf.fit(x,y)
svr_linear.fit(x,y)
svr_poly.fit(x,y)
Out[66]:
SVR(C=100, coef0=1, gamma='auto', kernel='poly')
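The epsilon parameter sets the width of the insensitive tube around the fitted function: errors smaller than epsilon are ignored. A sketch of how the tube width affects the model (assumes x and y from above; the epsilon values are illustrative):

# Sketch: larger epsilon -> wider insensitive tube -> fewer support vectors,
# giving a smoother but less exact fit
for eps in [0.01, 0.1, 0.5]:
    m = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=eps).fit(x, y)
    print(f'epsilon={eps}: support vectors = {len(m.support_)}')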
Generate predictions with the predict method, then use several evaluation metrics to check how well each model fits the training data
In [74]:
rbf_pred = svr_rbf.predict(x)
linear_pred = svr_linear.predict(x)
poly_pred = svr_poly.predict(x)
from sklearn.metrics import mean_squared_error, mean_absolute_error
preds = [rbf_pred, linear_pred, poly_pred]
kernel = ['RBF', 'Linear', 'Polynomial']
evls = ['mse','mae','rmse']
result = pd.DataFrame(index=kernel, columns=evls)
for pred, nm in zip(preds, kernel):
    mse = mean_squared_error(y, pred)
    mae = mean_absolute_error(y, pred)
    rmse = np.sqrt(mse)
    result.loc[nm, 'mse'] = round(mse, 2)
    result.loc[nm, 'mae'] = round(mae, 2)
    result.loc[nm, 'rmse'] = round(rmse, 2)
result
Out[74]:
|  | mse | mae | rmse |
|---|---|---|---|
| RBF | 0.1 | 0.19 | 0.31 |
| Linear | 0.23 | 0.34 | 0.48 |
| Polynomial | 0.09 | 0.18 | 0.31 |
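As the table shows, the RBF and polynomial kernels fit the sine curve far better than the linear kernel. To see where each kernel deviates from the data, a minimal matplotlib sketch (uses the x, y, preds, and kernel objects from above; not part of the original notebook):

# Sketch: overlay each kernel's predictions on the noisy sample data
plt.scatter(x, y, color='gray', s=15, label='data')
for pred, nm in zip(preds, kernel):
    plt.plot(x, pred, label=nm)
plt.legend()
plt.show()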