#코드 구현
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the insurance dataset from the ADPclass GitHub repository.
INSURANCE_CSV_URL = 'http://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/insurance.csv'
df = pd.read_csv(INSURANCE_CSV_URL)
# Bare expression: displays the frame when run as a notebook cell.
df


# Scatter plot of age (x) against charges (y) to inspect their relationship.
x, y = df['age'], df['charges']

plt.figure(figsize=(10, 5))
plt.scatter(x, y)
plt.xlabel('X')
plt.ylabel('Y')
plt.show()


from sklearn.linear_model import LinearRegression
# Convert the inputs to NumPy arrays and fit a linear regression.
# Both arrays are reshaped into single-column 2-D arrays; -1 lets NumPy infer
# the row count instead of hard-coding the 1338 rows of this particular CSV.
x = np.array(x).reshape(-1, 1)
y = np.array(y).reshape(-1, 1)
lr = LinearRegression()

lr.fit(x, y)

LinearRegression()

LinearRegression()


# Report the fitted intercept and coefficient, then the score on the data
# the model was fitted with (the coefficient of determination, R^2).
intercept, slope = lr.intercept_, lr.coef_
print('절편:', intercept, '계수:', slope)
r_squared = lr.score(x, y)
print(r_squared)

절편: [3165.88500606] 계수: [[257.72261867]]
0.08940589967885804


# Predict the target for two new inputs, 19 and 64, each as a one-feature row.
x_new = [[age] for age in (19, 64)]
y_hat = lr.predict(x_new)
print(y_hat)

[[ 8062.61476073]
 [19660.13260074]]


# Overlay the fitted line (red, drawn through the two predicted points)
# on the raw observations (blue dots).
plt.figure(figsize=(10, 5))
plt.plot(x_new, y_hat, '-r')
plt.plot(x, y, 'b.')
plt.xlabel('X')
# Must be ylabel: a second xlabel call would overwrite the x-axis label
# with 'Y' and leave the y axis unlabelled.
plt.ylabel('Y')
plt.show()


import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor

data = pd.read_csv('http://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/insurance.csv')
# Reshape both columns into single-column 2-D arrays; -1 infers the row count
# so the code does not depend on the file having exactly 1338 rows.
x = np.array(data['age']).reshape(-1, 1)
y = np.array(data['charges']).reshape(-1, 1)

# SGDRegressor is a regression model trained with stochastic gradient descent.
# Its max_iter parameter sets the maximum number of training iterations and
# its random_state parameter seeds the random number generator used during
# training. Neither is passed here, so the library defaults apply.
# NOTE(review): 'age' is fed in unscaled; SGD is sensitive to feature scale,
# which may explain why the intercept differs so much from the
# LinearRegression fit above. Consider standardizing the feature first.
sgd_reg = SGDRegressor()
# The target is flattened to 1-D with ravel() before fitting.
sgd_reg.fit(x, y.ravel())

SGDRegressor()

SGDRegressor()


# Show the intercept and coefficient learned by the SGD model.
sgd_intercept, sgd_slope = sgd_reg.intercept_, sgd_reg.coef_
print('절편:', sgd_intercept, '계수:', sgd_slope)

절편: [9669.23904679] 계수: [266.25007967]


# Predict the charges for inputs 19 and 64 with the SGD model.
x_new = [[age] for age in (19, 64)]
y_hat = sgd_reg.predict(x_new)
print(y_hat)

[14727.99056051 26709.24414562]


# Overlay the SGD model's fitted line (red) on the raw observations (blue dots).
plt.figure(figsize=(10, 5))
plt.plot(x_new, y_hat, '-r')
plt.plot(x, y, 'b.')
plt.xlabel('X')
# Must be ylabel: a second xlabel call would overwrite the x-axis label
# with 'Y' and leave the y axis unlabelled.
plt.ylabel('Y')
plt.show()


import pandas as pd
# Load the cereal dataset, then inspect its column summary and dimensions.
CEREAL_CSV_URL = 'http://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/cereal.csv'
cereal = pd.read_csv(CEREAL_CSV_URL)
cereal.info()
cereal.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      77 non-null     object 
 1   mfr       77 non-null     object 
 2   type      77 non-null     object 
 3   calories  77 non-null     int64  
 4   protein   77 non-null     int64  
 5   fat       77 non-null     int64  
 6   sodium    77 non-null     int64  
 7   fiber     77 non-null     float64
 8   carbo     77 non-null     float64
 9   sugars    77 non-null     int64  
 10  potass    77 non-null     int64  
 11  vitamins  77 non-null     int64  
 12  shelf     77 non-null     int64  
 13  weight    77 non-null     float64
 14  cups      77 non-null     float64
 15  rating    77 non-null     float64
dtypes: float64(5), int64(8), object(3)
memory usage: 9.8+ KB

(77, 16)


# Keep the columns from position 3 onward, then only the rows whose
# 'sugars' value is non-negative.
kept_columns = cereal.columns[3:]
cereal = cereal[kept_columns]
non_negative_sugars = cereal['sugars'] >= 0
cereal = cereal.loc[non_negative_sugars]
cereal


import matplotlib.pyplot as plt
# Sort the (sugars, rating) pairs by sugars and plot them to inspect the
# relationship between the two variables.
cereal2 = cereal[['sugars', 'rating']].sort_values(by='sugars', ascending=True)
cereal2.reset_index(drop=True, inplace=True)
x = cereal2['sugars'].values
y = cereal2['rating'].values
plt.scatter(x, y)
# show must actually be called; a bare `plt.show` only references the
# function object and never renders the figure.
plt.show()

<function matplotlib.pyplot.show(close=None, block=None)>


from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for evaluation; random_state=1 fixes the shuffle.
split_parts = train_test_split(x, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(53,) (23,)
(53,) (23,)


from sklearn.preprocessing import PolynomialFeatures

# Set up a degree-2 polynomial feature generator.
poly_reg = PolynomialFeatures(degree=2)

# Reshape the training inputs into a single column, then generate the
# polynomial feature matrix with fit_transform().
X_train_column = X_train.reshape(-1, 1)
X_poly = poly_reg.fit_transform(X_train_column)

# Fit a LinearRegression model on the polynomial features.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_poly, y_train)

LinearRegression()

LinearRegression()


import numpy as np
# Convert the test inputs with transform() on the already-fitted generator.
X_test_column = X_test.reshape(-1, 1)
X_test_poly = poly_reg.transform(X_test_column)

# Store the model's predictions for the test set in pred.
pred = reg.predict(X_test_poly)

# Print predictions (left column) next to the actual values (right column),
# with NumPy's print precision set to 2.
np.set_printoptions(precision=2)
pred_column = pred.reshape(len(pred), 1)
actual_column = y_test.reshape(len(y_test), 1)
print(np.concatenate((pred_column, actual_column), axis=1))

[[51.63 46.66]
 [32.1  28.74]
 [55.79 59.64]
 [31.08 37.84]
 [32.1  31.44]
 [44.46 44.33]
 [38.82 40.4 ]
 [41.45 55.33]
 [41.45 49.12]
 [31.38 27.75]
 [36.56 34.38]
 [34.7  29.92]
 [65.25 63.01]
 [33.21 31.07]
 [44.46 52.08]
 [38.82 40.45]
 [51.63 53.13]
 [36.56 33.98]
 [41.45 49.51]
 [31.04 22.74]
 [31.38 39.26]
 [31.5  31.23]
 [32.1  21.87]]


# Evaluate the polynomial regression on the held-out test set.
# (mean_squared_error was listed twice in the original import; once is enough.)
from sklearn.metrics import mean_absolute_error, mean_squared_error

mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)

# RMSE is the square root of MSE, which expresses the typical prediction
# error in the same units as the target.
rmse = np.sqrt(mse)

# score() returns the coefficient of determination (R^2): how well the model
# fits the given data. The closer to 1 the better; the best value is 1, and
# on held-out data it can fall below 0.
# The value is printed under the label 'ACC' as a percentage.
acc = reg.score(X_test_poly, y_test)


for label, value in (('MSE', mse), ('MAE', mae), ('RMSE', rmse), ('ACC', acc * 100)):
    print('{}\t{}'.format(label, round(value, 3)))

MSE	33.565
MAE	4.606
RMSE	5.794
ACC	74.376


import pandas as pd
# Reload the unfiltered cereal dataset.
CEREAL_CSV_URL = 'http://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/cereal.csv'
cereal = pd.read_csv(CEREAL_CSV_URL)
# Bare expression: displays the frame when run as a notebook cell.
cereal


import pandas as pd
import numpy as np
# Degree-2 polynomial regression on all cereal columns kept below:
# load -> filter -> split -> scale -> expand -> fit -> evaluate.
cereal = pd.read_csv('http://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/cereal.csv')

# Keep the columns from position 3 onward (this excludes the object-typed
# columns) and the rows whose 'sugars' value is 0 or greater.
cereal = cereal[cereal.columns[3:]]
cereal = cereal.loc[cereal['sugars'] >= 0]

# Every column except the last is a feature; the last column is the target.
X = cereal.iloc[:, :-1].values
y = cereal.iloc[:, -1].values

# Split into training and evaluation data at a 7:3 ratio with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Scaling, polynomial feature expansion, and model fitting.
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training data only and then applied to the
# test data.
MMS = MinMaxScaler()
X_train = MMS.fit_transform(X_train)
X_test = MMS.transform(X_test)

from sklearn.preprocessing import PolynomialFeatures
# PolynomialFeatures generates polynomial features (degree=2 here).
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
X_test_poly = poly_reg.transform(X_test)

# Fit the model.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_poly, y_train)

# Predict on the test set.
pred = reg.predict(X_test_poly)

# Check the predictive power with standard regression metrics.
# (mean_squared_error was listed twice in the original import; once is enough.)
from sklearn.metrics import mean_absolute_error, mean_squared_error

mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)

# RMSE is the square root of MSE, which expresses the typical prediction
# error in the same units as the target.
rmse = np.sqrt(mse)

# score() returns the coefficient of determination (R^2): how well the model
# fits the given data. The closer to 1 the better; the best value is 1, and
# on held-out data it can fall below 0.
# The value is printed under the label 'ACC' as a percentage.
acc = reg.score(X_test_poly, y_test)

for label, value in (('MSE', mse), ('MAE', mae), ('RMSE', rmse), ('ACC', acc * 100)):
    print('{}\t{}'.format(label, round(value, 3)))

(53, 12) (23, 12) (53,) (23,)
MSE	0.796
MAE	0.634
RMSE	0.892
ACC	99.39

	age	sex	bmi	children	smoker	region	charges
0	19	female	27.900	0	yes	southwest	16884.92400
1	18	male	33.770	1	no	southeast	1725.55230
2	28	male	33.000	3	no	southeast	4449.46200
3	33	male	22.705	0	no	northwest	21984.47061
4	32	male	28.880	0	no	northwest	3866.85520
...	...	...	...	...	...	...	...
1333	50	male	30.970	3	no	northwest	10600.54830
1334	18	female	31.920	0	no	northeast	2205.98080
1335	18	female	36.850	0	no	southeast	1629.83350
1336	21	female	25.800	0	no	southwest	2007.94500
1337	61	female	29.070	0	yes	northwest	29141.36030

	calories	protein	fat	sodium	fiber	carbo	sugars	potass	vitamins	shelf	weight	cups	rating
0	70	4	1	130	10.0	5.0	6	280	25	3	1.0	0.33	68.402973
1	120	3	5	15	2.0	8.0	8	135	0	3	1.0	1.00	33.983679
2	70	4	1	260	9.0	7.0	5	320	25	3	1.0	0.33	59.425505
3	50	4	0	140	14.0	8.0	0	330	25	3	1.0	0.50	93.704912
4	110	2	2	200	1.0	14.0	8	-1	25	3	1.0	0.75	34.384843
...	...	...	...	...	...	...	...	...	...	...	...	...	...
72	110	2	1	250	0.0	21.0	3	60	25	3	1.0	0.75	39.106174
73	110	1	1	140	0.0	13.0	12	25	25	2	1.0	1.00	27.753301
74	100	3	1	230	3.0	17.0	3	115	25	1	1.0	0.67	49.787445
75	100	3	1	200	3.0	17.0	3	110	25	1	1.0	1.00	51.592193
76	110	2	1	200	1.0	16.0	8	60	25	1	1.0	0.75	36.187559

	name	mfr	type	calories	protein	fat	sodium	fiber	carbo	sugars	potass	vitamins	shelf	weight	cups	rating
0	100% Bran	N	C	70	4	1	130	10.0	5.0	6	280	25	3	1.0	0.33	68.402973
1	100% Natural Bran	Q	C	120	3	5	15	2.0	8.0	8	135	0	3	1.0	1.00	33.983679
2	All-Bran	K	C	70	4	1	260	9.0	7.0	5	320	25	3	1.0	0.33	59.425505
3	All-Bran with Extra Fiber	K	C	50	4	0	140	14.0	8.0	0	330	25	3	1.0	0.50	93.704912
4	Almond Delight	R	C	110	2	2	200	1.0	14.0	8	-1	25	3	1.0	0.75	34.384843
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
72	Triples	G	C	110	2	1	250	0.0	21.0	3	60	25	3	1.0	0.75	39.106174
73	Trix	G	C	110	1	1	140	0.0	13.0	12	25	25	2	1.0	1.00	27.753301
74	Wheat Chex	R	C	100	3	1	230	3.0	17.0	3	115	25	1	1.0	0.67	49.787445
75	Wheaties	G	C	100	3	1	200	3.0	17.0	3	110	25	1	1.0	1.00	51.592193
76	Wheaties Honey Gold	G	C	110	2	1	200	1.0	16.0	8	60	25	1	1.0	0.75	36.187559

빅분기6장(단순선형회귀, 다항회귀)

6장 머신러닝 - 지도학습¶

6-1. 단순 선형 회귀 : 독립변수가 하나인 경우 데이터의 특징을 가장 잘 설명하는 직선을 학습하는 것¶

선형회귀 모델을 잘 학습시키려면 MSE 값을 최소화 하는 파라미터를 찾아야 한다. 경사하강법을 사용해 문제를 해결¶

age와 charges 사이의 선형 모델을 생성하기 앞서, 두 데이터 사이의 선형성을 확인 ( 우상향 추세를 확인 )¶

LinearRegression 모델 학습¶

선형 회귀 모델 결과¶

새로운 x데이터 19, 64에 대한 예측값 출력¶

산점도 위에 회귀선을 그어 선형 모델이 데이터를 얼마나 설명할 수 있는지 시각화¶

6-2. 경사하강법 : 함수의 값이 낮아지는 방향으로 독립변수의 값을 바꿔가며 최종적으로 최소 함숫값을 갖도록 하는 독립 변수값을 찾는 방식¶

학습률이 너무 작으면 수렴하기까지 반복을 여러 번 수행해야 하므로 시간이 오래 걸린다는 단점, 학습률이 너무 높으면 함수의 값이 발산되는 경향이 있다. 하이퍼파라미터인 학습률을 적정한 크기로 조절해야함¶

sklearn의 linear_model의 여러 함수 중 SGDRegressor은 확률적 경사하강법을 사용한 방식으로 회귀 모델을 구현한다.¶

sklearn을 이용한 SGD회귀 모델 결과¶

6-3. 다항회귀 : 데이터가 단순한 직선 형태가 아닌 비선형의 형태를 갖고 있을 때, 각 변수의 거듭제곱을 새로운 변수로 추가하면 선형 모델을 사용할 수 있다.¶

단순한 선형회귀 모델에서 독립 변수와 종속 변수 사이의 비선형 관계를 모델링하기 위해 사용하는 회귀분석 기법¶

1) sklearn의 PolynomialFeatures : 다항변수를 생성 하는 함수¶

다항회귀는 다항변수를 생성한 뒤 LinearRegression 함수에 적용하면 된다¶

2) 다항회귀 코드 실습¶

2-1) 분석에 필요한 데이터만 추출하기 위해 전처리 진행.¶

데이터타입 object제외, sugars가 0이상인 데이터만 추출¶

2-2) sugars 변수와 rating 변수 사이의 관계를 확인하기 위해 산점도를 그려봄¶

2-3) sugars를 설명변수, rating을 타깃변수로 설정하고, train_test_split을 통해 학습데이터와 평가데이터를 7:3의 비율로 분할¶

2-4) PolynomialFeatures로 2차 다항식 조건을 생성한 뒤 fit_transform() 메서드로 X_poly 데이터를 생성, 다항 회귀분석을 위해 LinearRegression으로 회귀모델을 불러온다, fit() 메서드를 활용해 변환된 데이터를 학습¶

2-5) 다항 회귀 모델을 평가하기 위해 X_test를 transform()메서드를 활용해 변환¶

다항회귀 모델에 predict() 메서드를 적용해 예측값을 pred에 지정¶

np.set_printoptions(precision=2) : 소수점 둘째자리로 맞춤¶

np.concatenate로 예측값, 실제값 합친뒤 출력¶

2-6) 회귀분석에서 사용하는 다양한 성능평가지표를 통해 모델의 예측력을 평가해보자¶

2-7) 고차 다항 회귀분석 코드 실습¶