빅데이터분석기사 준비
빅분기 2유형 문제연습(회귀-2)
세용용용용
2023. 6. 8. 14:27
대학원 입학가능성 데이터¶
데이터 설명 : 대학원 입학 가능성 예측( 종속변수 : Chance of Admit)¶
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/x_train.csv¶
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/x_test.csv¶
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/y_train.csv¶
y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/y_test.csv¶
In [80]:
# Load the graduate-admission dataset (train/test features and targets) from remote CSVs.
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/x_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/x_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/admission/y_test.csv')
# Drop the identifier columns ('ID', 'Serial No.') — they carry no predictive signal.
X_train = x_train.drop(columns=['ID', 'Serial No.'])
X_test = x_test.drop(columns=['ID', 'Serial No.'])
display(X_train)
display(y_train.head())
# Keep only the target column as a Series.
y_train = y_train['Chance of Admit']
y_test = y_test['Chance of Admit']
# Exploratory checks (left commented out):
#print(X_train.info())
#print(X_train.nunique())
#print(X_train.isnull().sum())  # no nulls
#print(X_test.isnull().sum())  # no nulls
# Research, SOP, LOR and University Rating look numeric but are categorical,
# so cast them to object dtype (they are re-attached unscaled later).
for i in ['University Rating', 'SOP', 'LOR', 'Research']:
X_train[i] = X_train[i].astype('object')
X_test[i] = X_test[i].astype('object')
# Replace numeric outliers with the column mean.
def solution(data, col):
    """Replace IQR outliers in ``data[col]`` with the column mean, in place.

    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are overwritten with the mean
    of the ORIGINAL column (outliers included in the mean), matching the
    original behaviour.

    Parameters:
        data (pd.DataFrame): frame to clean; modified in place.
        col (str): name of the numeric column to clean.

    Returns:
        pd.DataFrame: the same (mutated) frame, for convenience.
    """
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3 - q1
    upper = q3 + (1.5 * iqr)
    lower = q1 - (1.5 * iqr)
    # FIX: compute the mean once — the original lambda recomputed
    # data[col].mean() for every row (O(n^2)); mask() vectorizes the replace.
    mean = data[col].mean()
    data[col] = data[col].mask((data[col] > upper) | (data[col] < lower), mean)
    return data
# Apply the IQR-based outlier replacement to every numeric column.
# NOTE(review): solution() is fitted separately on train and test, so the test
# set is cleaned with its own quantiles/mean — consider reusing the train
# statistics on the test set instead.
for i in X_train.select_dtypes(exclude='object').columns:
solution(X_train, i)
solution(X_test, i)
# Split into train (0.67) / validation (0.33) with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
# Standard-scale the numeric columns; the scaler is fitted on train only and
# reused for validation and test (no leakage here).
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
obj_col = X_train.select_dtypes(include='object').columns
sds.fit(X_train.drop(columns=obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns=X_train.drop(columns=obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns=X_validation.drop(columns=obj_col).columns)
X_test_sc = sds.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns=X_test.drop(columns=obj_col).columns)
# Re-attach the untouched categorical (object) columns; alignment relies on the
# indexes having been reset above (X_test kept its original RangeIndex).
for i in obj_col:
X_train_sc[i] = X_train[i]
X_validation_sc[i] = X_validation[i]
X_test_sc[i] = X_test[i]
# Log-transform the numeric columns: each frame is first shifted by its own
# |min| so all values are >= 0 before log1p.
# NOTE(review): the shift offset is computed per split (train/validation/test
# each use their own min), so the transform differs across splits — consider
# reusing the train offset. log1p after standard scaling is also unusual since
# the data is already centered; confirm this is intended.
import numpy as np
for i in X_train_sc.select_dtypes(exclude='object').columns:
if X_train_sc[i].min() < 0:
X_train_sc[i] = X_train_sc[i].map(lambda x: x + abs(X_train_sc[i].min()))
if X_validation_sc[i].min() < 0:
X_validation_sc[i] = X_validation_sc[i].map(lambda x: x + abs(X_validation_sc[i].min()))
if X_test_sc[i].min() < 0:
X_test_sc[i] = X_test_sc[i].map(lambda x: x + abs(X_test_sc[i].min()))
X_train_sc[i] = np.log1p(X_train_sc[i])
X_validation_sc[i] = np.log1p(X_validation_sc[i])
X_test_sc[i] = np.log1p(X_test_sc[i])
# Encode the object columns before modelling.
# NOTE(review): the original comment said "one-hot encoding because the number
# of categories is small", but the code actually applies LabelEncoder (ordinal
# integer codes); pd.get_dummies would match the stated intent.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Fit on the concatenation of all splits so every category is known to the encoder.
x_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])
for i in x_full.select_dtypes(include='object').columns:
labelencoder.fit(x_full[i])
X_train_sc[i] = labelencoder.transform(X_train_sc[i])
X_validation_sc[i] = labelencoder.transform(X_validation_sc[i])
X_test_sc[i] = labelencoder.transform(X_test_sc[i])
# Fit a RandomForestRegressor with a fixed seed for reproducibility.
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=40)
RFR.fit(X_train_sc, Y_train)
pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)
# Evaluate predictions against actuals on train and validation: MSE, MAE, RMSE, R^2.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
mse_train = mean_squared_error(Y_train, pred_train)
mse_validation = mean_squared_error(Y_validation, pred_validation)
mae_train = mean_absolute_error(Y_train, pred_train)
mae_validation = mean_absolute_error(Y_validation, pred_validation)
rmse_train = np.sqrt(mse_train)
rmse_validation = np.sqrt(mse_validation)
r2_score_train = r2_score(Y_train, pred_train)
r2_score_validation = r2_score(Y_validation, pred_validation)
print('mse_train', mse_train)
print('mae_train', mae_train)
print('rmse_train', rmse_train)
print('r2_score_train', r2_score_train)
print('\n')
print('mse_validation', mse_validation)
print('mae_validation', mae_validation)
print('rmse_validation', rmse_validation)
print('r2_score_validation', r2_score_validation)
# Finally predict the held-out test set, report the same metrics,
# and write the submission CSV (ID + predicted Chance of Admit).
pred_test = RFR.predict(X_test_sc)
mse_test = mean_squared_error(y_test, pred_test)
mae_test = mean_absolute_error(y_test, pred_test)
rmse_test = np.sqrt(mse_test)
r2_score_test = r2_score(y_test, pred_test)
print('\n')
print('mse_test',mse_test)
print('mae_test',mae_test)
print('rmse_test',rmse_test)
print('r2_score_test',r2_score_test)
pd.DataFrame({'ID':x_test['ID'], 'Chance of Admit':pred_test}).to_csv('school.csv', index=False)
| GRE Score | TOEFL Score | University Rating | SOP | LOR | CGPA | Research | |
|---|---|---|---|---|---|---|---|
| 0 | 327 | 114 | 3 | 3.0 | 3.0 | 9.02 | 0 |
| 1 | 321 | 109 | 4 | 4.0 | 4.0 | 8.68 | 1 |
| 2 | 301 | 99 | 3 | 2.5 | 2.0 | 8.45 | 1 |
| 3 | 317 | 106 | 2 | 2.0 | 3.5 | 8.12 | 0 |
| 4 | 321 | 111 | 3 | 3.5 | 4.0 | 8.83 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 395 | 312 | 109 | 2 | 2.5 | 4.0 | 9.02 | 0 |
| 396 | 316 | 104 | 3 | 3.0 | 3.5 | 8.00 | 1 |
| 397 | 295 | 99 | 2 | 2.5 | 3.0 | 7.65 | 0 |
| 398 | 296 | 95 | 2 | 3.0 | 2.0 | 7.54 | 1 |
| 399 | 304 | 103 | 5 | 5.0 | 3.0 | 7.92 | 0 |
400 rows × 7 columns
| ID | Chance of Admit | |
|---|---|---|
| 0 | 0 | 0.61 |
| 1 | 1 | 0.69 |
| 2 | 2 | 0.68 |
| 3 | 3 | 0.73 |
| 4 | 4 | 0.77 |
mse_train 0.0005432898507462684 mae_train 0.016814179104477644 rmse_train 0.023308578908768085 r2_score_train 0.9729979375154096 mse_validation 0.0073422296969696986 mae_validation 0.0646742424242424 rmse_validation 0.08568681168633653 r2_score_validation 0.6351237590655042 mse_test 0.0024170792999999944 mae_test 0.036169 rmse_test 0.04916380070743102 r2_score_test 0.8666167456988487
Out[80]:
| ID | Chance of Admit | |
|---|---|---|
| 0 | 13 | 0.6771 |
| 1 | 16 | 0.7251 |
| 2 | 23 | 0.8339 |
| 3 | 25 | 0.5957 |
| 4 | 29 | 0.5699 |
| ... | ... | ... |
| 95 | 467 | 0.5985 |
| 96 | 484 | 0.9164 |
| 97 | 485 | 0.6600 |
| 98 | 489 | 0.8742 |
| 99 | 499 | 0.5354 |
100 rows × 2 columns
레드 와인 퀄리티 예측 데이터¶
데이터 설명 : 레드 와인 퀄리티 예측문제 (종속변수 : quality)¶
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/x_train.csv¶
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/x_test.csv¶
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/y_train.csv¶
y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/y_test.csv¶
In [134]:
# Load the red-wine quality dataset from remote CSVs.
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/x_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/x_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine/y_test.csv')
# Drop the identifier column.
X_train = x_train.drop(columns='ID')
X_test = x_test.drop(columns='ID')
display(X_train)
display(y_train.head())
# Keep only the target column as a Series.
y_train = y_train['quality']
y_test = y_test['quality']
# Exploratory checks (left commented out):
#print(X_train.info())
#print(X_train.nunique())  # no categorical columns — all numeric
#print(X_train.isnull().sum())  # no nulls
#print(X_test.isnull().sum())  # no nulls
# First, clean up outliers in the numeric columns.
def solution(data, col):
    """Replace IQR outliers in ``data[col]`` with the column mean, in place.

    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are overwritten with the mean
    of the ORIGINAL column (outliers included in the mean), matching the
    original behaviour.

    Parameters:
        data (pd.DataFrame): frame to clean; modified in place.
        col (str): name of the numeric column to clean.

    Returns:
        pd.DataFrame: the same (mutated) frame, for convenience.
    """
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3 - q1
    upper = q3 + (1.5 * iqr)
    lower = q1 - (1.5 * iqr)
    # FIX: compute the mean once — the original lambda recomputed
    # data[col].mean() for every row (O(n^2)); mask() vectorizes the replace.
    mean = data[col].mean()
    data[col] = data[col].mask((data[col] > upper) | (data[col] < lower), mean)
    return data
# Apply the IQR-based outlier replacement to every numeric column.
# NOTE(review): train and test are cleaned with their own statistics —
# consider reusing the train quantiles/mean on the test set.
for i in X_train.select_dtypes(exclude='object').columns:
solution(X_train,i)
solution(X_test,i)
# Split into train (0.67) / validation (0.33) with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state = 43)
#print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
# Standard-scale the numeric columns (scaler fitted on train only).
obj_col = X_train.select_dtypes(include='object').columns
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
sds.fit(X_train.drop(columns = obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns = X_train.drop(columns=obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns=obj_col).columns)
X_test_sc = sds.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns = X_test.drop(columns=obj_col).columns)
# Re-attach the untouched object columns (this dataset has none, so the loop is a no-op).
for i in obj_col:
X_train_sc[i] = X_train[i]
X_validation_sc[i] = X_validation[i]
X_test_sc[i] = X_test[i]
# Log-transform the numeric columns: shift each frame by its own |min| so all
# values are >= 0 before log1p.
# NOTE(review): indentation was lost in this export — whether each np.log1p
# call sits inside its `if` (unlike the first cell, where log1p runs
# unconditionally) needs confirming against the original notebook. The
# per-split shift offsets also differ between train/validation/test.
import numpy as np
for i in X_train_sc.select_dtypes(exclude='object').columns:
if X_train_sc[i].min()<0:
X_train_sc[i] = X_train_sc[i].map(lambda x : x+abs(X_train_sc[i].min()))
X_train_sc[i] = np.log1p(X_train_sc[i])
if X_validation_sc[i].min()<0:
X_validation_sc[i] = X_validation_sc[i].map(lambda x : x+abs(X_validation_sc[i].min()))
X_validation_sc[i] = np.log1p(X_validation_sc[i])
if X_test_sc[i].min()<0:
X_test_sc[i] = X_test_sc[i].map(lambda x : x+abs(X_test_sc[i].min()))
X_test_sc[i] = np.log1p(X_test_sc[i])
# No object columns remain, so no encoding is needed — go straight to modelling.
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=40)
RFR.fit(X_train_sc, Y_train)
pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)
# Evaluate predictions against actuals on train and validation: MSE, MAE, RMSE, R^2.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
mse_train = mean_squared_error(Y_train, pred_train)
mse_validation = mean_squared_error(Y_validation, pred_validation)
mae_train = mean_absolute_error(Y_train, pred_train)
mae_validation = mean_absolute_error(Y_validation, pred_validation)
rmse_train = np.sqrt(mse_train)
rmse_validation = np.sqrt(mse_validation)
r2_score_train = r2_score(Y_train, pred_train)
r2_score_validation = r2_score(Y_validation, pred_validation)
print('mse_train',mse_train)
print('mae_train',mae_train)
print('rmse_train',rmse_train)
print('r2_score_train',r2_score_train)
print('\n')
print('mse_validation',mse_validation)
print('mae_validation',mae_validation)
print('rmse_validation',rmse_validation)
print('r2_score_validation',r2_score_validation)
# Finally predict the held-out test set, report the same metrics,
# and write the submission CSV (ID + predicted quality).
pred_test = RFR.predict(X_test_sc)
mse_test = mean_squared_error(y_test, pred_test)
mae_test = mean_absolute_error(y_test, pred_test)
rmse_test = np.sqrt(mse_test)
r2_score_test = r2_score(y_test, pred_test)
print('\n')
print('mse_test',mse_test)
print('mae_test',mae_test)
print('rmse_test',rmse_test)
print('r2_score_test',r2_score_test)
pd.DataFrame({'ID':x_test['ID'], 'quality':pred_test}).to_csv('wine.csv', index=False)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.6 | 0.44 | 0.68 | 4.1 | 0.114 | 6.0 | 24.0 | 0.99700 | 3.06 | 0.66 | 13.4 |
| 1 | 7.0 | 0.60 | 0.30 | 4.5 | 0.068 | 20.0 | 110.0 | 0.99914 | 3.30 | 1.17 | 10.2 |
| 2 | 8.0 | 0.43 | 0.36 | 2.3 | 0.075 | 10.0 | 48.0 | 0.99760 | 3.34 | 0.46 | 9.4 |
| 3 | 7.9 | 0.53 | 0.24 | 2.0 | 0.072 | 15.0 | 105.0 | 0.99600 | 3.27 | 0.54 | 9.4 |
| 4 | 8.0 | 0.45 | 0.23 | 2.2 | 0.094 | 16.0 | 29.0 | 0.99620 | 3.21 | 0.49 | 10.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1274 | 9.4 | 0.40 | 0.47 | 2.5 | 0.087 | 6.0 | 20.0 | 0.99772 | 3.15 | 0.50 | 10.5 |
| 1275 | 11.0 | 0.30 | 0.58 | 2.1 | 0.054 | 7.0 | 19.0 | 0.99800 | 3.31 | 0.88 | 10.5 |
| 1276 | 9.3 | 0.39 | 0.44 | 2.1 | 0.107 | 34.0 | 125.0 | 0.99780 | 3.14 | 1.22 | 9.5 |
| 1277 | 7.0 | 0.50 | 0.14 | 1.8 | 0.078 | 10.0 | 23.0 | 0.99636 | 3.53 | 0.61 | 10.4 |
| 1278 | 6.5 | 0.67 | 0.00 | 4.3 | 0.057 | 11.0 | 20.0 | 0.99488 | 3.45 | 0.56 | 11.8 |
1279 rows × 11 columns
| ID | quality | |
|---|---|---|
| 0 | 1 | 6 |
| 1 | 2 | 5 |
| 2 | 3 | 5 |
| 3 | 4 | 6 |
| 4 | 5 | 6 |
mse_train 0.05081869158878505 mae_train 0.1622663551401869 rmse_train 0.22543001483561378 r2_score_train 0.9207777306883845 mse_validation 0.4251274231678487 mae_validation 0.5014420803782507 rmse_validation 0.6520179623046045 r2_score_validation 0.37446445264958395 mse_test 0.3656053125 mae_test 0.44378124999999996 rmse_test 0.6046530513443226 r2_score_test 0.3963173374613004
Out[134]:
| ID | quality | |
|---|---|---|
| 0 | 0 | 5.98 |
| 1 | 7 | 6.28 |
| 2 | 12 | 5.06 |
| 3 | 13 | 4.66 |
| 4 | 25 | 5.08 |
| ... | ... | ... |
| 315 | 1587 | 5.24 |
| 316 | 1589 | 5.59 |
| 317 | 1591 | 5.02 |
| 318 | 1592 | 5.13 |
| 319 | 1595 | 6.28 |
320 rows × 2 columns
현대 차량 가격 문제 데이터¶
데이터 설명 : 현대 차량 가격문제 (종속변수 : price)¶
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/x_train.csv¶
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/x_test.csv¶
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/y_train.csv¶
y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/y_test.csv¶
In [189]:
# Load the Hyundai car-price dataset from remote CSVs.
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/x_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/x_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/hyundai/y_test.csv')
# Drop the identifier column.
X_train = x_train.drop(columns='ID')
X_test = x_test.drop(columns='ID')
display(X_train)
display(y_train.head())
# Keep only the target column as a Series.
y_train = y_train['price']
y_test = y_test['price']
# Exploratory checks (left commented out):
#print(X_train.info())
#print(X_train.nunique())
#print(X_test.isnull().sum())  # no nulls
#print(X_train.isnull().sum())  # no nulls
## 'year' is numeric but categorical in nature — cast it to object dtype.
X_train['year'] = X_train['year'].astype('object')
X_test['year'] = X_test['year'].astype('object')
# Clean up outliers in the numeric columns.
def solution(data, col):
    """Replace IQR outliers in ``data[col]`` with the column mean, in place.

    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are overwritten with the mean
    of the ORIGINAL column (outliers included in the mean), matching the
    original behaviour.

    Parameters:
        data (pd.DataFrame): frame to clean; modified in place.
        col (str): name of the numeric column to clean.

    Returns:
        pd.DataFrame: the same (mutated) frame, for convenience.
    """
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3 - q1
    upper = q3 + (1.5 * iqr)
    lower = q1 - (1.5 * iqr)
    # FIX: compute the mean once — the original lambda recomputed
    # data[col].mean() for every row (O(n^2)); mask() vectorizes the replace.
    mean = data[col].mean()
    data[col] = data[col].mask((data[col] > upper) | (data[col] < lower), mean)
    return data
# Apply the IQR-based outlier replacement to every numeric column.
# NOTE(review): train and test are cleaned with their own statistics —
# consider reusing the train quantiles/mean on the test set.
for i in X_train.select_dtypes(exclude='object').columns:
solution(X_train, i)
solution(X_test, i)
# Split into train (0.67) / validation (0.33) with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
# Standard-scale the numeric columns (scaler fitted on train only).
obj_col = X_train.select_dtypes(include='object').columns
#print(obj_col)
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
sds.fit(X_train.drop(columns=obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns = X_train.drop(columns=obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns=obj_col).columns)
X_test_sc = sds.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns = X_test.drop(columns=obj_col).columns)
# Re-attach the untouched categorical (object) columns.
for i in obj_col:
X_train_sc[i] = X_train[i]
X_validation_sc[i] = X_validation[i]
X_test_sc[i] = X_test[i]
# Log-transform the numeric columns: shift each frame by its own |min| so all
# values are >= 0 before log1p.
# NOTE(review): per-split shift offsets differ between train/validation/test;
# indentation was lost in this export, so whether each np.log1p call sits
# inside its `if` needs confirming against the original notebook.
import numpy as np
for i in X_train_sc.select_dtypes(exclude='object').columns:
if X_train_sc[i].min() < 0:
X_train_sc[i] = X_train_sc[i].map(lambda x : x+abs(X_train_sc[i].min()))
X_train_sc[i] = np.log1p(X_train_sc[i])
if X_validation_sc[i].min() < 0:
X_validation_sc[i] = X_validation_sc[i].map(lambda x : x+abs(X_validation_sc[i].min()))
X_validation_sc[i] = np.log1p(X_validation_sc[i])
if X_test_sc[i].min() < 0:
X_test_sc[i] = X_test_sc[i].map(lambda x : x+abs(X_test_sc[i].min()))
X_test_sc[i] = np.log1p(X_test_sc[i])
#display(X_validation_sc)
# Encoding before modelling:
# fuelType and transmission get one-hot encoding (via get_dummies);
# model and year have many categories, so they are label-encoded first.
X_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
for i in ['model','year']:
X_full[i] = labelencoder.fit_transform(X_full[i])
X_full = pd.get_dummies(X_full)
# Recover the three splits by position: concat preserved row order, and each
# slice has the same length as the frame it replaces, so the chained
# len() calls still give the right boundaries after reassignment.
X_train_sc = X_full[:len(X_train_sc)]
X_validation_sc = X_full[len(X_train_sc):len(X_train_sc)+len(X_validation_sc)]
X_test_sc = X_full[len(X_train_sc)+len(X_validation_sc):]
# Modelling with RandomForestRegressor.
# FIX: the original constructed RandomForestRegressor() with no random_state,
# unlike the other two cells which pin random_state=40 — seed it here too so
# the reported metrics are reproducible and consistent across the notebook.
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=40)
RFR.fit(X_train_sc, Y_train)
pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)
# Evaluate predictions against actuals on train and validation: MSE, MAE, RMSE, R^2.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)
train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)
train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)
train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)
print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)
# Finally predict the held-out test set, report the same metrics,
# and write the submission CSV (ID + predicted price).
pred_test = RFR.predict(X_test_sc)
test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)
print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)
pd.DataFrame({'ID':x_test['ID'], 'price':pred_test}).to_csv('handa_car.csv', index=False)
| model | year | transmission | mileage | fuelType | tax(£) | mpg | engineSize | |
|---|---|---|---|---|---|---|---|---|
| 0 | I30 | 2019 | Manual | 21 | Petrol | 150 | 34.0 | 2.0 |
| 1 | Santa Fe | 2018 | Semi-Auto | 10500 | Diesel | 145 | 39.8 | 2.2 |
| 2 | Tucson | 2017 | Manual | 29968 | Diesel | 30 | 61.7 | 1.7 |
| 3 | Kona | 2018 | Manual | 27317 | Petrol | 145 | 52.3 | 1.0 |
| 4 | Tucson | 2018 | Semi-Auto | 31459 | Diesel | 145 | 57.7 | 1.7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3883 | Tucson | 2019 | Manual | 10200 | Petrol | 145 | 34.9 | 1.6 |
| 3884 | I10 | 2019 | Manual | 9487 | Petrol | 145 | 60.1 | 1.0 |
| 3885 | Tucson | 2016 | Manual | 25224 | Diesel | 30 | 61.7 | 1.7 |
| 3886 | Kona | 2019 | Manual | 8550 | Petrol | 145 | 44.1 | 1.0 |
| 3887 | Kona | 2018 | Manual | 26615 | Petrol | 145 | 52.3 | 1.0 |
3888 rows × 8 columns
| ID | price | |
|---|---|---|
| 0 | 0 | 23995 |
| 1 | 1 | 28490 |
| 2 | 2 | 13251 |
| 3 | 3 | 14990 |
| 4 | 4 | 17591 |
train_mse 245345.66791365875 train_mae 326.63128535810705 train_rmse 495.32380107729404 train_r2_score 0.9929084080931919 validation_mse 1631374.6119890925 validation_mae 857.5842187138968 validation_rmse 1277.2527596326001 validation_r2_score 0.9526308751437623 test_mse 8179892.77387731 test_mae 765.227514489026 test_rmse 2860.0511837862814 test_r2_score 0.8009698904317185
In [ ]: