#이상치 탐색 실습
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine

# Load the wine dataset with load_wine()
wine_load = load_wine()

# pd.DataFrame() builds a pandas DataFrame
# pd.DataFrame(data, list of column names)
wine = pd.DataFrame(wine_load.data, columns=wine_load.feature_names)
wine['Class'] = wine_load.target
# Map the integer targets 0/1/2 to readable class labels
wine['Class'] = wine['Class'].map({0:'class_0', 1:'class_1', 2:'class_2'})
wine


# The whis parameter sets how far the whiskers reach, as a multiple of the IQR;
# points beyond the whiskers are drawn as outliers.
plt.boxplot(wine['color_intensity'], whis=1.5)
# show must be called: a bare `plt.show` only evaluates to the function object
# and never renders the figure.
plt.show()

<function matplotlib.pyplot.show(close=None, block=None)>


import numpy as np
def outliers_iqr(date, columns, whis=1.5):
    """Return the values of one column that fall outside the IQR fences.

    Args:
        date: DataFrame to inspect (name kept as-is for existing callers).
        columns: Label of the single numeric column to test.
        whis: Multiplier applied to the IQR to place the fences. The default
            of 1.5 reproduces the previous hard-coded behaviour.

    Returns:
        A one-column DataFrame, with the original index labels, holding the
        values strictly below ``Q1 - whis * IQR`` or strictly above
        ``Q3 + whis * IQR``.
    """
    values = date[columns]
    # nanpercentile skips missing values; np.percentile would return NaN
    # quartiles, making every comparison False and hiding all outliers.
    quartile_1, quartile_3 = np.nanpercentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_whis = quartile_1 - (iqr * whis)
    upper_whis = quartile_3 + (iqr * whis)
    outliers = date[(values > upper_whis) | (values < lower_whis)]
    # Double brackets keep the result a DataFrame rather than a Series.
    return outliers[[columns]]
# Collect the color_intensity values that the IQR rule flags as outliers
outliers = outliers_iqr(wine, 'color_intensity')
outliers


# Drop the flagged rows by index label
drop_outliers = wine.drop(index = outliers.index)

print('원래 데이터:', wine.shape)
print('이상치 제거 데이터:', drop_outliers.shape)

# Confirms that 4 rows were removed

원래 데이터: (178, 14)
이상치 제거 데이터: (174, 14)


# Overwrite the outliers with NaN so they can be imputed like missing values.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
wine.loc[outliers.index, 'color_intensity'] = np.nan

# Count the missing values
print(wine['color_intensity'].isnull().sum())

# Replace the missing values with the column mean (mean() skips NaN).
# The result is assigned back: fillna(inplace=True) on a column selection is
# chained assignment, which pandas deprecates and which leaves the frame
# unchanged under copy-on-write.
wine['color_intensity'] = wine['color_intensity'].fillna(wine['color_intensity'].mean())
wine.loc[outliers.index, 'color_intensity']

# Count the missing values again
print(wine['color_intensity'].isnull().sum())

4
0


# Load the data
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
iris = pd.DataFrame(iris.data, columns = iris.feature_names)
# `iris` now names the DataFrame, so the targets come from a fresh load_iris() call
iris['Class'] = load_iris().target
iris['Class'] = iris['Class'].map({0:'Setosa', 1:'Versicolour', 2:'Virginica'})

# Replace the categorical Class column with dummy variables
iris_dummy = pd.get_dummies(iris, columns = ['Class'])
iris_dummy


# Load the data
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
iris = pd.DataFrame(iris.data, columns = iris.feature_names)
# `iris` now names the DataFrame, so the targets come from a fresh load_iris() call
iris['Class'] = load_iris().target
iris['Class'] = iris['Class'].map({0:'Setosa', 1:'Versicolour', 2:'Virginica'})

# Split the data: features are every column except Class, the target is Class;
# 20% of the rows go to the test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.drop(columns='Class'), iris['Class'], test_size=0.2, random_state=1004)
print('X_train:',X_train.shape, 'X_test:',X_test.shape)
print('y_train:',y_train.shape, 'y_test:',y_test.shape)

X_train: (120, 4) X_test: (30, 4)
y_train: (120,) y_test: (30,)


# First four rows of the training features
X_train.head(4)


# First four training labels, then the per-class counts of the training split
print(y_train.head(4))
print('----------------------------')
print(y_train.value_counts())

87     Versicolour
67     Versicolour
131      Virginica
74     Versicolour
Name: Class, dtype: object
----------------------------
Versicolour    41
Setosa         40
Virginica      39
Name: Class, dtype: int64


# Stratified sampling: stratify is given the Class column so the split is
# stratified on the class labels.
# The test features must be bound to X_test (previously misspelled X_Test);
# otherwise X_test keeps the rows of the earlier unstratified split and no
# longer corresponds to y_test.
X_train, X_test, y_train, y_test = train_test_split(iris.drop(columns='Class'), iris['Class'], test_size=0.2, random_state=1004, stratify = iris['Class'])
print('X_train:',X_train.shape, 'X_test:',X_test.shape)
print('y_train:',y_train.shape, 'y_test:',y_test.shape)

X_train: (120, 4) X_test: (30, 4)
y_train: (120,) y_test: (30,)


# Per-class counts of the training labels after the stratified split
print(y_train.value_counts())

Versicolour    40
Virginica      40
Setosa         40
Name: Class, dtype: int64


# StandardScaler: standardization, rescaling each column to mean 0 and variance 1
# Course note: the output range is not bounded, so it is very sensitive to outliers;
# described as more useful for classification than for regression
from sklearn.preprocessing import StandardScaler
StdScaler = StandardScaler()

# Learn the distribution from the training data, then scale it
StdScaler.fit(X_train)
X_train_sc = StdScaler.transform(X_train)

# Scale the test data with the statistics learned from the training data
X_test_sc = StdScaler.transform(X_test)

print('X_train scaler:', X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(), X_train_sc.std())
print('X_test_sc scaler:', X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(), X_test_sc.std())

X_train scaler: -2.37157895896038 3.0448741749063144 7.031412489292658e-16 1.0000000000000002
X_test_sc scaler: -1.764170396783293 2.2416293573806088 0.05768124101010797 1.0182220940432938


# MinMaxScaler: normalization that rescales each column into the range 0 to 1
# Course note: minimum becomes 0 and maximum 1; very sensitive to outliers;
# described as more useful for regression than for classification
from sklearn.preprocessing import MinMaxScaler
MMScaler = MinMaxScaler()

# Learn the distribution from the training data, then scale it
MMScaler.fit(X_train)
X_train_sc = MMScaler.transform(X_train)

# Scale the test data with the statistics learned from the training data
X_test_sc = MMScaler.transform(X_test)

print('X_train Scaler:', X_train_sc.min() , X_train_sc.max(), X_train_sc.mean() , X_train_sc.std())
print('X_test Scaler:', X_test_sc.min() , X_test_sc.max(), X_test_sc.mean() , X_test_sc.std())

X_train Scaler: 0.0 1.0 0.45576170793176024 0.26651168193639474
X_test Scaler: 0.02941176470588247 0.9999999999999998 0.47087487537387834 0.2761214199407616


# MaxAbsScaler: normalization that rescales each column so that its largest
# absolute value becomes 1
# Course note: for all-positive data it is described as behaving like
# MinMaxScaler; sensitive to outliers; described as more useful for regression
# than for classification
# NOTE(review): the recorded run printed a training minimum of 0.04 here versus
# 0.0 for MinMaxScaler, so the two are not identical — confirm the note
from sklearn.preprocessing import MaxAbsScaler
MaScaler = MaxAbsScaler()

# Learn the distribution from the training data, then scale it
MaScaler.fit(X_train)
X_train_sc = MaScaler.transform(X_train)

# Scale the test data with the statistics learned from the training data
X_test_sc = MaScaler.transform(X_test)

print('X_train Scaler:', X_train_sc.min() , X_train_sc.max(), X_train_sc.mean() , X_train_sc.std())
print('X_test Scaler:', X_test_sc.min() , X_test_sc.max(), X_test_sc.mean() , X_test_sc.std())

X_train Scaler: 0.04 1.0 0.6200624215760084 0.2399279995496501
X_test Scaler: 0.08 1.0 0.6318743177112742 0.24331008806723997


# RobustScaler: uses the median and quartiles instead of the mean and variance
# Centres each column on its median and scales by the IQR to limit the
# influence of outliers
# The quantile_range parameter can be adjusted to tune the outlier handling
# NOTE(review): the original note gave the default as [0.25, 0.75]; scikit-learn
# documents it as (25.0, 75.0), i.e. in percent — confirm
from sklearn.preprocessing import RobustScaler
RuScaler = RobustScaler()

# Learn the distribution from the training data, then scale it
RuScaler.fit(X_train)
X_train_sc = RuScaler.transform(X_train)

# Scale the test data with the statistics learned from the training data
X_test_sc = RuScaler.transform(X_test)

print('X_train Scaler:', X_train_sc.min() , X_train_sc.max(), X_train_sc.mean() , X_train_sc.std())
print('X_test Scaler:', X_test_sc.min() , X_test_sc.max(), X_test_sc.mean() , X_test_sc.std())

X_train Scaler: -1.904761904761905 2.666666666666668 -0.023394383394383343 0.645740495068888
X_test Scaler: -1.333333333333334 1.7142857142857144 0.012918192918192946 0.6389035146009887


# Use scaler.inverse_transform() to go back to the original scale

# Inspect the scaled data
pd.DataFrame(X_train_sc).head(3)


# Convert back to the original scale
X_original = RuScaler.inverse_transform(X_train_sc)
pd.DataFrame(X_original).head(3)


import pandas as pd
from sklearn.datasets import load_iris

# Load the data
iris = load_iris()
iris = pd.DataFrame(iris.data, columns = iris.feature_names)
# `iris` now names the DataFrame, so the targets come from a fresh load_iris() call
iris['Class'] = load_iris().target
iris['Class'] = iris['Class'].map({0:'Setosa', 1:'Versicolour', 2:'Virginica'})

# Scale the numeric columns (everything except Class) ahead of PCA
x = iris.drop(columns='Class')
from sklearn.preprocessing import RobustScaler
x = RobustScaler().fit_transform(x)

pd.DataFrame(x).head()


from sklearn.decomposition import PCA
pca = PCA(n_components = 4) # number of principal components to produce
pca_fit = pca.fit(x)

# NOTE(review): the first label reads "eigenvalues", but singular_values_ holds
# the singular values of the components; scikit-learn exposes the eigenvalues
# of the covariance matrix as explained_variance_ — confirm which is intended
print('고유 값:', pca.singular_values_)
# Share of the total variance explained by each component
print('분산 설명력:', pca.explained_variance_ratio_)

고유 값: [12.44864825  9.32285783  2.71310669  0.90362564]
분산 설명력: [0.61972166 0.3475765  0.02943649 0.00326535]


import matplotlib.pyplot as plt

# Scree plot: explained-variance ratio of each principal component, in order.
plt.title('Scree Plot')
plt.xlabel('Number of Components')
# The plotted series is the per-component ratio, not a running total, so the
# axis must not be labelled "Cumulative".
plt.ylabel('Explained Variance Ratio')
# Number the components from 1 so the x values agree with the axis label.
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, 'o-')
plt.show()


# Create a PCA object that keeps 2 principal components
pca = PCA(n_components =2)

# Transform the data into its 2 principal components
principalComponents = pca.fit_transform(x)
principas_iris = pd.DataFrame(data = principalComponents, columns = ['pc1','pc2'])
principas_iris


import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot of the two principal components, coloured by the iris Class column
plt.title('2 component PCA')
sns.scatterplot(x='pc1', y='pc2', hue=iris.Class, data=principas_iris)
plt.show()


from sklearn.datasets import fetch_openml
import pandas as pd

boston = fetch_openml(name='boston', version=1) # load the Boston housing-price dataset
df = pd.DataFrame(boston.data, columns = boston.feature_names)
df['PRICE'] = boston.target
df.head() # show the first 5 rows

C:\Users\82108\anaconda3\lib\site-packages\sklearn\datasets\_openml.py:968: FutureWarning: The default value of `parser` will change from `'liac-arff'` to `'auto'` in 1.4. You can set `parser='auto'` to silence this warning. Therefore, an `ImportError` will be raised from 1.4 if the dataset is dense and pandas is not installed. Note that the pandas parser may return different data types. See the Notes Section in fetch_openml's API doc for details.
  warn(


# Cast CHAS and RAD to float so that every column is numeric
df['CHAS'] = df['CHAS'].astype('float')
df['RAD'] = df['RAD'].astype('float')
print('데이터의 형태:', df.shape)
df.info() # check the dtype of every column
df.isnull().sum() # check whether the data has missing values

데이터의 형태: (506, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
PRICE      0
dtype: int64


df.corr() # check the pairwise correlations between the variables


from sklearn.model_selection import train_test_split
# Features are every column except PRICE; 20% of the rows go to the test set
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns='PRICE'), df['PRICE'], 
                                                    test_size=0.2, random_state=42)

# Compare the mean of the target variable in the two splits
print('학습데이터 세트 price평균 :', y_train.mean())
print('평가데이터 세트 price평균 :', y_test.mean())

학습데이터 세트 price평균 : 22.79653465346535
평가데이터 세트 price평균 : 21.488235294117654


from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn the per-column min/max from the training data only, then scale it
x_train_scaler = scaler.fit_transform(x_train)
# Reuse the training min/max for the evaluation data. Calling fit_transform
# here would refit the scaler on the test set, putting train and test on
# different scales and leaking test statistics into the preprocessing.
x_test_scaler = scaler.transform(x_test)

print(x_train_scaler)
print(x_test_scaler)

[[1.68762759e-01 0.00000000e+00 6.42962963e-01 ... 8.08510638e-01
  8.80427656e-01 6.39624724e-01]
 [6.95009416e-03 0.00000000e+00 2.74074074e-01 ... 8.93617021e-01
  9.96772404e-01 1.85982340e-01]
 [2.87746689e-04 3.50000000e-01 1.97037037e-01 ... 4.57446809e-01
  9.12627969e-01 1.68322296e-01]
 ...
 [6.68786251e-05 8.00000000e-01 4.70370370e-02 ... 4.68085106e-01
  9.84971506e-01 1.17549669e-01]
 [1.25342233e-01 0.00000000e+00 6.42962963e-01 ... 8.08510638e-01
  2.76186394e-01 5.94370861e-01]
 [2.46945108e-03 0.00000000e+00 2.89629630e-01 ... 8.82978723e-01
  1.77719502e-01 2.45584989e-01]]
[[1.86839552e-03 0.00000000e+00 1.31598240e-01 ... 4.00000000e-01
  9.96412280e-01 1.80645161e-01]
 [1.09576391e-03 4.21052632e-01 2.18108504e-01 ... 5.11111111e-01
  1.00000000e+00 1.90615836e-02]
 [2.17360031e-03 0.00000000e+00 1.00000000e+00 ... 7.88888889e-01
  9.82599559e-01 4.45454545e-01]
 ...
 [3.07049575e-01 0.00000000e+00 6.46627566e-01 ... 8.00000000e-01
  7.27025780e-02 5.37829912e-01]
 [9.96069504e-04 0.00000000e+00 1.47727273e-01 ... 6.11111111e-01
  9.97667982e-01 2.92668622e-01]
 [1.94141730e-03 0.00000000e+00 4.53445748e-01 ... 6.33333333e-01
  9.64968479e-01 1.77712610e-01]]


from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(x_train_scaler, y_train) # train the linear regression model

pred_train = linear.predict(x_train_scaler) # predictions of the trained model on the training data


# Evaluate performance on the training data with regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

mae = mean_absolute_error(y_train, pred_train)
mse = mean_squared_error(y_train, pred_train)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, pred_train)

print('MAE: {0: .5f}'.format(mae)) # mean absolute error
print('MSE: {0: .5f}'.format(mse)) # mean squared error
print('RMSE: {0: .5f}'.format(rmse)) # root mean squared error (smaller means more accurate predictions)
print('R2: {0: .5f}'.format(r2)) # how closely the predictions match the actual values
# Course note: ranges from 0 to 1, and closer to 1 means better predictions
# NOTE(review): scikit-learn documents that r2_score can be negative for an
# arbitrarily poor model — confirm the stated range

MAE:  3.31477
MSE:  21.64141
RMSE:  4.65203
R2:  0.75089


pred = linear.predict(x_test_scaler) # predictions of the trained model on the scaled evaluation data

# Evaluate performance on the evaluation data with regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print('MAE: {0: .5f}'.format(mae)) # mean absolute error
print('MSE: {0: .5f}'.format(mse)) # mean squared error
print('RMSE: {0: .5f}'.format(rmse)) # root mean squared error (smaller means more accurate predictions)
print('R2: {0: .5f}'.format(r2)) # how closely the predictions match the actual values
# Course note: ranges from 0 to 1, and closer to 1 means better predictions
# NOTE(review): scikit-learn documents that r2_score can be negative for an
# arbitrarily poor model — confirm the stated range

MAE:  4.25829
MSE:  31.55305
RMSE:  5.61721
R2:  0.56973


# Store the predictions in a DataFrame so they can be compared with the actual values
pred_df = pd.DataFrame(pred, columns=['pred_Price'])
pred_df.head()


# .values discards the original index, so both frames share a default 0..n-1 index
actual = pd.DataFrame(y_test.values, columns=['actual_Price'])
actual.head()


# Join pred_df and actual side by side with pd.concat() and save as 'reg_result.csv'
reg_result = pd.concat([actual, pred_df], axis=1)
# Save as CSV
reg_result.to_csv('reg_result.csv', index=False, encoding='utf-8-sig')


from sklearn.datasets import load_iris
import pandas as pd

# Build the iris DataFrame with the integer target stored in Species
iris = load_iris()
df = pd.DataFrame(iris.data, columns = iris.feature_names)
df['Species'] = iris.target

print(df['Species'].unique()) # 0: Setosa, 1: Versicolor, 2: Virginica
print(df.shape)
print(df.info())
df.head()

[0 1 2]
(150, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   Species            150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB
None


from sklearn.model_selection import train_test_split
# Stratified 80/20 split on Species
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns='Species'), df['Species'],
                                                   test_size = 0.2, random_state=0,
                                                    stratify = df['Species'])


# Check whether there are any missing values
df.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
Species              0
dtype: int64


from sklearn.tree import DecisionTreeClassifier


# max_depth and random_state are hyperparameters
# max_depth : maximum depth of the decision tree
# random_state : seed used for the randomness in the splits
dtree_clf_5 = DecisionTreeClassifier(max_depth=5, random_state=100)
dtree_clf_3 = DecisionTreeClassifier(max_depth=3, random_state=100)
dtree_clf_1 = DecisionTreeClassifier(max_depth=1, random_state=100)


# Cross-validation: 10-fold accuracy on the training split for each tree depth
from sklearn.model_selection import cross_val_score
import numpy as np

# max_depth = 5
score = cross_val_score(dtree_clf_5, x_train, y_train, scoring='accuracy', cv=10)
print('depth = 5 일떄 교차검증 정확도:', np.round(score, 3))
print('depth = 5 일떄 교차검증 평균 정확도:', np.round(np.mean(score), 4))
print("======================================================================")

# max_depth = 3
score = cross_val_score(dtree_clf_3, x_train, y_train, scoring='accuracy', cv=10)
print('depth = 3 일떄 교차검증 정확도:', np.round(score, 3))
print('depth = 3 일떄 교차검증 평균 정확도:', np.round(np.mean(score), 4))
print("======================================================================")

# max_depth = 1
score = cross_val_score(dtree_clf_1, x_train, y_train, scoring='accuracy', cv=10)
print('depth = 1 일떄 교차검증 정확도:', np.round(score, 3))
print('depth = 1 일떄 교차검증 평균 정확도:', np.round(np.mean(score), 4))

depth = 5 일떄 교차검증 정확도: [0.917 1.    0.917 1.    1.    0.833 1.    0.917 1.    0.833]
depth = 5 일떄 교차검증 평균 정확도: 0.9417
======================================================================
depth = 3 일떄 교차검증 정확도: [0.917 1.    0.917 0.917 1.    0.833 1.    0.917 0.917 0.833]
depth = 3 일떄 교차검증 평균 정확도: 0.925
======================================================================
depth = 1 일떄 교차검증 정확도: [0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667]
depth = 1 일떄 교차검증 평균 정확도: 0.6667


# Fit the depth-5 tree on the training split and predict the held-out test split
dtree_clf_5.fit(x_train, y_train)
pred = dtree_clf_5.predict(x_test)

# accuracy_score measures the prediction accuracy of a classifier
from sklearn.metrics import accuracy_score
print('의사결정나무(교차검증후) 예측 정학도: {0:.5f}'.format(accuracy_score(y_test, pred)))

의사결정나무(교차검증후) 예측 정학도: 0.96667


# Predicted labels as a DataFrame
pred_df = pd.DataFrame(pred, columns=['pred Species'])
pred_df.head()


# .values discards the original index, so both frames share a default 0..n-1 index
actual_df = pd.DataFrame(y_test.values, columns=['actual Species'])
actual_df.head()


# Join actual and predicted labels side by side and save them as CSV
clf_result = pd.concat([actual_df, pred_df], axis=1)
clf_result.to_csv('clf_result.csv', index=False, encoding = 'utf-8-sig')


clf_result.head()

	alcohol	malic_acid	ash	alcalinity_of_ash	magnesium	total_phenols	flavanoids	nonflavanoid_phenols	proanthocyanins	color_intensity	hue	od280/od315_of_diluted_wines	proline	Class
0	14.23	1.71	2.43	15.6	127.0	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065.0	class_0
1	13.20	1.78	2.14	11.2	100.0	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050.0	class_0
2	13.16	2.36	2.67	18.6	101.0	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185.0	class_0
3	14.37	1.95	2.50	16.8	113.0	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480.0	class_0
4	13.24	2.59	2.87	21.0	118.0	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735.0	class_0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
173	13.71	5.65	2.45	20.5	95.0	1.68	0.61	0.52	1.06	7.70	0.64	1.74	740.0	class_2
174	13.40	3.91	2.48	23.0	102.0	1.80	0.75	0.43	1.41	7.30	0.70	1.56	750.0	class_2
175	13.27	4.28	2.26	20.0	120.0	1.59	0.69	0.43	1.35	10.20	0.59	1.56	835.0	class_2
176	13.17	2.59	2.37	20.0	120.0	1.65	0.68	0.53	1.46	9.30	0.60	1.62	840.0	class_2
177	14.13	4.10	2.74	24.5	96.0	2.05	0.76	0.56	1.35	9.20	0.61	1.60	560.0	class_2

	color_intensity
151	10.80
158	13.00
159	11.75
166	10.68

	0	1	2	3
0	0.846154	0.190476	0.157143	0.133333
1	-0.153846	-0.380952	0.157143	0.466667
2	-0.076923	0.000000	-0.042857	-0.066667

	0	1	2	3
0	-0.538462	1.0	-0.842857	-0.733333
1	-0.692308	0.0	-0.842857	-0.733333
2	-0.846154	0.4	-0.871429	-0.733333
3	-0.923077	0.2	-0.814286	-0.733333
4	-0.615385	1.2	-0.842857	-0.733333

	pc1	pc2
0	1.413744	-0.024325
1	0.831382	-0.848814
2	1.172348	-0.645958
3	1.050723	-0.821647
4	1.577837	0.080808
...	...	...
145	-0.862755	0.593353
146	-1.237964	-0.406031
147	-0.709554	0.451953
148	-0.197931	0.986021
149	-0.435905	0.151063

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	Class_Setosa	Class_Versicolour	Class_Virginica
0	5.1	3.5	1.4	0.2	1	0	0
1	4.9	3.0	1.4	0.2	1	0	0
2	4.7	3.2	1.3	0.2	1	0	0
3	4.6	3.1	1.5	0.2	1	0	0
4	5.0	3.6	1.4	0.2	1	0	0
...	...	...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	0	0	1
146	6.3	2.5	5.0	1.9	0	0	1
147	6.5	3.0	5.2	2.0	0	0	1
148	6.2	3.4	5.4	2.3	0	0	1
149	5.9	3.0	5.1	1.8	0	0	1

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	PRICE
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1	296.0	15.3	396.90	4.98	24.0
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2	242.0	17.8	396.90	9.14	21.6
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2	242.0	17.8	392.83	4.03	34.7
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3	222.0	18.7	394.63	2.94	33.4
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3	222.0	18.7	396.90	5.33	36.2

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	PRICE
CRIM	1.000000	-0.200469	0.406583	-0.055892	0.420972	-0.219247	0.352734	-0.379670	0.625505	0.582764	0.289946	-0.385064	0.455621	-0.388305
ZN	-0.200469	1.000000	-0.533828	-0.042697	-0.516604	0.311991	-0.569537	0.664408	-0.311948	-0.314563	-0.391679	0.175520	-0.412995	0.360445
INDUS	0.406583	-0.533828	1.000000	0.062938	0.763651	-0.391676	0.644779	-0.708027	0.595129	0.720760	0.383248	-0.356977	0.603800	-0.483725
CHAS	-0.055892	-0.042697	0.062938	1.000000	0.091203	0.091251	0.086518	-0.099176	-0.007368	-0.035587	-0.121515	0.048788	-0.053929	0.175260
NOX	0.420972	-0.516604	0.763651	0.091203	1.000000	-0.302188	0.731470	-0.769230	0.611441	0.668023	0.188933	-0.380051	0.590879	-0.427321
RM	-0.219247	0.311991	-0.391676	0.091251	-0.302188	1.000000	-0.240265	0.205246	-0.209847	-0.292048	-0.355501	0.128069	-0.613808	0.695360
AGE	0.352734	-0.569537	0.644779	0.086518	0.731470	-0.240265	1.000000	-0.747881	0.456022	0.506456	0.261515	-0.273534	0.602339	-0.376955
DIS	-0.379670	0.664408	-0.708027	-0.099176	-0.769230	0.205246	-0.747881	1.000000	-0.494588	-0.534432	-0.232471	0.291512	-0.496996	0.249929
RAD	0.625505	-0.311948	0.595129	-0.007368	0.611441	-0.209847	0.456022	-0.494588	1.000000	0.910228	0.464741	-0.444413	0.488676	-0.381626
TAX	0.582764	-0.314563	0.720760	-0.035587	0.668023	-0.292048	0.506456	-0.534432	0.910228	1.000000	0.460853	-0.441808	0.543993	-0.468536
PTRATIO	0.289946	-0.391679	0.383248	-0.121515	0.188933	-0.355501	0.261515	-0.232471	0.464741	0.460853	1.000000	-0.177383	0.374044	-0.507787
B	-0.385064	0.175520	-0.356977	0.048788	-0.380051	0.128069	-0.273534	0.291512	-0.444413	-0.441808	-0.177383	1.000000	-0.366087	0.333461
LSTAT	0.455621	-0.412995	0.603800	-0.053929	0.590879	-0.613808	0.602339	-0.496996	0.488676	0.543993	0.374044	-0.366087	1.000000	-0.737663
PRICE	-0.388305	0.360445	-0.483725	0.175260	-0.427321	0.695360	-0.376955	0.249929	-0.381626	-0.468536	-0.507787	0.333461	-0.737663	1.000000

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	pred_Price
0	32.036849
1	39.129879
2	17.358595
3	27.378004
4	20.807614

	actual_Price
0	23.6
1	32.4
2	13.6
3	22.8
4	16.1

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	Class_Setosa	Class_Versicolour	Class_Virginica
0	5.1	3.5	1.4	0.2	1	0	0
1	4.9	3.0	1.4	0.2	1	0	0
2	4.7	3.2	1.3	0.2	1	0	0
3	4.6	3.1	1.5	0.2	1	0	0
4	5.0	3.6	1.4	0.2	1	0	0
...	...	...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	0	0	1
146	6.3	2.5	5.0	1.9	0	0	1
147	6.5	3.0	5.2	2.0	0	0	1
148	6.2	3.4	5.4	2.3	0	0	1
149	5.9	3.0	5.1	1.8	0	0	1

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

빅데이터분석기사 5장

4장 데이터 전처리¶

데이터 전처리 : 수행하고자 하는 분석에 적합하게 데이터를 가공하는 작업¶

데이터 클리닝 : 결측치,이상치 처리¶

데이터 통합 : 다양한 데이터 파일의 결합¶

데이터 변환 : 스케일링, 요약¶

데이터 축소 : 변수축소, 라벨링¶

불균형 데이터 처리 : 언더 샘플링, 오버 샘플링¶

데이터 분할 : train, test 데이터 분할¶

4-1 이상치 확인 : 일반적으로 IQR 방식을 이상치 판단 기준으로 사용¶

from sklearn.datasets import load_wine : sklearn 패키지에서 제공하는 와인 데이터셋¶

데이터를 불러오기 위해 load_wine() 함수 사용¶

이상치를 가져오는 함수를 만들어 이상치의 위치와 값을 확인하는 코드¶

4-2 이상치 정제¶

이상치를 가진 행을 삭제하는 방법과 이상치를 적절한 값으로 정제하는 것이 있다.¶

이상치 제거¶

이상치 대체¶

4-3 범주형 변수 처리¶

부류나 범위, 서열 등으로 구분하여 수집된 변수¶

더미변수 : 범주형 변수에 있는 범주 각각을 컬럼으로 변경하고, 원본 컬럼의 값이 해당 범주에 속하는지 여부에 따라 1혹은 0으로 채운 변수¶

pd.get_dummies(data, columns = ['범주형1', '범주형2'])¶

iris 데이터의 Class 컬럼을 활용해 범주형 변수를 더미변수 형태로 대체하는 것을 연습해보자¶

4-4 데이터 분할¶

모델 학습 및 성과를 확인하기 위해서 데이터를 Train, Test 세트로 나누고 독립변수와 종속변수를 분리해야 한다.¶

데이터 분할은 scikit-learn의 train_test_split() 함수를 사용하여 수행¶

arrays: 데이터셋을 나타내는 배열이나 데이터프레임들. 이 중 X는 독립 변수(피처), y는 종속 변수(타겟)을 나타냅니다.¶

test_size: 테스트셋의 비율을 나타내는 부동 소수점(float) 값이나 정수 값입니다. (기본값은 0.25)¶

train_size: 학습셋의 비율을 나타내는 부동 소수점(float) 값이나 정수 값입니다. (기본값은 test_size의 나머지)¶

random_state: 데이터셋을 무작위로 나눌 때 사용되는 시드(seed) 값입니다. (기본값은 None)¶

shuffle: 데이터셋을 나누기 전에 셔플(무작위 섞기)할지 여부를 나타내는 불리언(boolean) 값입니다. (기본값은 True)¶

stratify 인자는 층화임의추출 여부를 결정한다(범주형 변수의 범주별 개수 차이가 크게 날 때 사용!!!)¶

4-5 데이터 스케일링¶

분석 알고리즘은 컬럼 간 데이터의 범위가 크게 차이날 경우 잘 동작하지 않는다.¶

데이터 스케일링 방법¶

주의점 train, test 데이터를 같은 scaler 객체로 스케일링해야 한다는 것이다.¶

스케일링에는 표준화와 정규화가 있다.¶

표준화 : 각 컬럼의 평균을 0, 분산을 1인 정규 분포로 만드는 방법¶

정규화 : 각 컬럼들의 값이 특정 범위(주로 0~1) 안에 들어가도록 스케일링 하는 방법¶

4-6. 차원 축소¶

1)설명변수 선택 : 차원축소 가장 간단한 방법으로, 유용하지 않거나 상관관계가 높은 컬럼은 제거(장점은 : 해석이 용이하고 수행 과정이 간단함, 단점은 : 설명변수 간의 고차원적인 상관관계는 고려하기 어렵다는 단점)¶

2)주성분 분석(PCA) : 기존의 컬럼을 새롭게 해석해 저차원의 초평면에 투영하는 것. 데이터를 충분히 설명할 수 있는 몇 개의 주성분으로 압축한다.¶

주성분 분석의 과정¶

1) PCA를 위한 전처리 : PCA수행전 변수 간 스케일의 차이가 주성분 선정에 영향을 주는 것을 방지하기 위해 이상치를 제거하고 스케일링을 수행한다¶

2) 주성분 추출 : scikit-learn의 PCA를 사용하여 주성분을 추출한다¶

singular_values_는 각 주성분에 대응하는 특이값(singular value)으로, 값이 클수록 해당 주성분이 설명하는 분산이 크다(분산의 비율이 아님)¶

explained_variance_ratio_ 는 전체 데이터에서 각 주성분이 설명할 수 있는 분산의 비율을 의미¶

3) Scree Plot으로 사용할 주성분의 개수 정하기 : 주성분 개수가 증가할수록 Scree Plot의 기울기는 감소한다. 보통 플롯의 기울기가 급격히 감소하는 지점의 직전까지를 주성분으로 선택!!!¶

4) 새로운 데이터프레임 확인 : 주성분의 수를 정하고 2)를 다시 수행해 주성분 객체를 생성하고, fit_transform()을 수행해 원하는 개수의 주성분을 가진 데이터 프레임을 만든다.¶

5) 주성분 산포도 확인 : 주성분 데이터 프레임의 산포도를 다시 확인하면 원본 데이터프레임으로 그린 산포도보다 종속변수를 더 잘 설명하는 산포도를 확인할 수 있다.¶

4-7. 데이터 불균형 문제 처리¶

모델은 소수의 데이터인 Target의 중요도를 낮게 판단하므로 궁극적으로 분석가가 원하는 모델을 만들 수 없을 것이다.¶

오버샘플링 : 소수의 비정상 데이터의 수를 늘림¶

언더 샘플링 : 상대적으로 많은 정상 데이터에서 일부만 사용¶

1) 언더 샘플링 : 데이터의 수가 급격하게 줄어들어 학습 성능을 떨어뜨리는 결과를 초래할 수 있음 ㅠㅠ¶

5장. 머신러닝 프로세스¶

step1 : 데이터 확인(독립변수, 종속변수 확인 >> 적용가능 분석모델 확인)¶

step2 : 데이터 분할(학습60-80, 검증10-20, 평가10-20)¶

step3 : 전처리(결측치와 이상치 처리, 정규화 및 표준화)¶

step4 : 모델학습(회귀,분류,비지도,하이퍼파라미터 조절)¶

step5 : 성능평가(분석 정확도 확인/ 알고리즘 성능 제시)¶

5-1 성능평가 기법 : 머신러닝 모델은 여러 가지 방법으로 예측 성능을 평가할 수 있다. 이를 성능평가지표라 함¶

1) 회귀분석 성능 평가지표¶

MAE : 실제값과 예측값의 차이를 절댓값으로 변환해 평균한것¶

MSE : 실제값과 예측값의 차이를 제곱해 평균한 것¶

RMSE : 실제값과 예측값의 차이를 제곱해 평균한 것에 루트를 씌운 것¶

MSLE : 실제값과 예측값에 각각 로그(log(1+x))를 취한 뒤, 그 차이를 제곱해 평균한 것¶

MAPE : MAE를 퍼센트로 변환한 것¶

2) 분류분석 성능 평가지표¶

정확도 : 실제 데이터에서 예측 데이터가 얼마나 같은지 판단하는 지표¶

혼동행렬 : 정확도의 한계점을 보완하기 위해 혼동행렬을 활용¶

정밀도, 재현율 : 데이터 예측에 집중한 성능평가지표¶

정밀도 : positive로 예측한 것 중 실제로도 positive인 것들의 비율¶

재현율 : 실제 positive인 것중 positive로 예측한 것들의 비율¶

정밀도와 재현율의 상충관계(트레이드오프) : 분류 결정 임곗값을 조정해 정밀도, 재현율 수치를 조절할 수 있다. 하지만 이 둘은 상충관계인 성능평가지표이기에 한쪽을 강제로 높이면 다른 하나의 수치가 떨어짐.¶

두 평가지표의 수치가 적절한 조화를 이루어 종합적으로 분류모델의 성능을 평가해야함¶

F1스코어 : 정밀도와 재현율을 결합한 분류 성능지표¶

ROC곡선과 AUC스코어 : 이진 분류모델의 주요 성능평가지표¶

5-2 머신러닝 분석 빠르게 맛보기 - 회귀분석¶

1) 데이터 확인(데이터 가져오기)¶

2) 데이터 분할¶

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	Class_Setosa	Class_Versicolour	Class_Virginica
0	5.1	3.5	1.4	0.2	1	0	0
1	4.9	3.0	1.4	0.2	1	0	0
2	4.7	3.2	1.3	0.2	1	0	0
3	4.6	3.1	1.5	0.2	1	0	0
4	5.0	3.6	1.4	0.2	1	0	0
...	...	...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	0	0	1
146	6.3	2.5	5.0	1.9	0	0	1
147	6.5	3.0	5.2	2.0	0	0	1
148	6.2	3.4	5.4	2.3	0	0	1
149	5.9	3.0	5.1	1.8	0	0	1

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2