빅데이터분석기사 준비
빅분기 2유형 문제연습(회귀-1)
세용용용용
2023. 6. 5. 15:46
학생성적 예측 데이터¶
데이터 설명 : 학생성적 예측(종속변수 : G3)¶
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv¶
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv¶
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv¶
y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_test.csv¶
In [2]:
#데이터 불러오기
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_test.csv')
display(x_train)
display(y_train.head(5))
#print(x_train.info())
#print(x_train.nunique())
#print(x_train.isnull().sum()) #널 값은 없다
#나중에 스케일 age, absences, G1,G2 값을 해주자
#일단을 StudentID 컬럼을 제거해주자
X_train = x_train.drop(columns='StudentID')
X_test = x_test.drop(columns='StudentID')
y_train = y_train['G3']
y_test = y_test['G3']
#가장 우선으로 숫자형 컬럼들 이상치를 제거해주자
def solution(data, col):
q3 = data[col].quantile(0.75)
q1 = data[col].quantile(0.25)
iqr = q3 - q1
upper = q3 + (iqr*1.5)
lower = q1 - (iqr*1.5)
data[col] = data[col].map(lambda x : data[col].mean() if x>upper or x<lower else x)
return data
for i in X_train.select_dtypes(exclude='object').columns:
X_train = solution(X_train,i)
for i in X_test.select_dtypes(exclude='object').columns:
X_test = solution(X_test,i)
#로그변환을 해주자
#데이터의 분포를 변환하여 비대칭성을 완화하거나
#오른쪽으로 치우친 분포를 정규 분포에 가깝게 만드는 데 사용됩니다.
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
if X_train[i].min() < 0:
X_train[i] = X_train[i].map(lambda x : x+abs(X_train[i].min()))
X_train[i] = np.log1p(X_train[i])
for i in X_test.select_dtypes(exclude='object').columns:
if X_test[i].min() < 0:
X_test[i] = X_test[i].map(lambda x : x+abs(X_test[i].min()))
X_test[i] = np.log1p(X_test[i])
#이제 데이터를 분할 해주자
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#스케일링 하게되면 인덱스가 리셋됨 그렇게되면 다시 objet컬럼이랑 매핑시켜야되는데
#인덱스가 다르므로 메핑이 안됨 그러니 스케일링 전 인덱스 리셋을 시켜주는거다
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
#숫자형 데이터 스케일링 해주자
from sklearn.preprocessing import StandardScaler
object_columns = X_train.select_dtypes(include='object').columns
scs = StandardScaler()
scs.fit(X_train.drop(columns=object_columns))
X_train_sc = scs.transform(X_train.drop(columns=object_columns))
X_train_sc = pd.DataFrame(X_train_sc, columns=X_train.drop(columns=object_columns).columns)
for i in object_columns:
X_train_sc[i] = X_train[i]
X_validation_sc = scs.transform(X_validation.drop(columns=object_columns))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns=object_columns).columns)
for i in object_columns:
X_validation_sc[i] = X_validation[i]
#마지막으로 모델을 돌리기전 object컬럼 인코딩을 시켜줘야됨
X_train_sc.select_dtypes(include='object').nunique() #범주를 확인해보니 그렇게 많지않음
#원핫인코딩 레츠고
X_train_sc = pd.get_dummies(X_train_sc)
X_validation_sc = pd.get_dummies(X_validation_sc)
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=42)
RFR.fit(X_train_sc, Y_train)
pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)
#실제 데이터와 예측한 데이터를 비교해 성능평가 ㄱ ㄱ
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)
train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)
train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)
train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)
print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)
#최종적으로 test데이터를 예측해 csv파일로 제출
obj_col = X_test.select_dtypes(include='object')
#test데이터 스케일링
X_test_sc = scs.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns=X_test.drop(columns=obj_col).columns)
for i in obj_col:
X_test_sc[i] = X_test[i]
X_test_sc
#원핫인코딩 수행
X_test_sc = pd.get_dummies(X_test_sc)
X_test_sc
#모델 예측 수행
pred_test = RFR.predict(X_test_sc)
#실제데이터와 예측데이터 성능평가지표 ㄱ ㄱ ㄱ
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)
print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)
pd.DataFrame({'StudentID':x_test['StudentID'], 'G3':pred_test}).to_csv('result.csv', index=False)
| StudentID | school | sex | age | address | famsize | Pstatus | Medu | Fedu | Mjob | ... | romantic | famrel | freetime | goout | Dalc | Walc | health | absences | G1 | G2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1714 | GP | F | 18 | U | GT3 | T | 4 | 3 | other | ... | no | 4 | 3 | 3 | 1 | 1 | 3 | 0 | 14 | 13 |
| 1 | 1254 | GP | F | 17 | U | GT3 | T | 4 | 3 | health | ... | yes | 4 | 4 | 3 | 1 | 3 | 4 | 0 | 13 | 15 |
| 2 | 1639 | GP | F | 16 | R | GT3 | T | 4 | 4 | health | ... | no | 2 | 4 | 4 | 2 | 3 | 4 | 6 | 10 | 11 |
| 3 | 1118 | GP | M | 16 | U | GT3 | T | 4 | 4 | services | ... | no | 5 | 3 | 3 | 1 | 3 | 5 | 0 | 15 | 13 |
| 4 | 1499 | GP | M | 19 | U | GT3 | T | 3 | 2 | services | ... | yes | 4 | 5 | 4 | 1 | 1 | 4 | 0 | 5 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 673 | 1074 | GP | M | 15 | U | GT3 | T | 4 | 4 | services | ... | no | 5 | 3 | 3 | 1 | 1 | 5 | 4 | 10 | 13 |
| 674 | 1044 | GP | M | 15 | R | GT3 | T | 4 | 4 | other | ... | yes | 1 | 3 | 5 | 3 | 5 | 1 | 8 | 12 | 10 |
| 675 | 1078 | GP | M | 17 | U | LE3 | T | 4 | 4 | other | ... | no | 4 | 1 | 1 | 2 | 2 | 5 | 0 | 12 | 13 |
| 676 | 1055 | MS | M | 17 | R | GT3 | T | 1 | 1 | other | ... | yes | 4 | 5 | 5 | 1 | 3 | 2 | 0 | 10 | 9 |
| 677 | 1394 | GP | F | 17 | U | GT3 | T | 3 | 2 | health | ... | no | 5 | 2 | 2 | 1 | 2 | 5 | 0 | 18 | 18 |
678 rows × 33 columns
| StudentID | G3 | |
|---|---|---|
| 0 | 1714 | 14 |
| 1 | 1254 | 15 |
| 2 | 1639 | 11 |
| 3 | 1118 | 13 |
| 4 | 1499 | 0 |
train_mse 0.38765121445178663
train_mae 0.3789489720998532
train_rmse 0.6226164264230318
train_r2_score 0.9758410777214178
validation_mse 1.9441621305803574
validation_mae 0.8704918154761906
validation_rmse 1.3943321450000201
validation_r2_score 0.8361182581646641
test_mse 3.1833930596539166
test_mae 1.0744653916211293
test_rmse 1.7842065630565078
test_r2_score 0.7929832649002089
중고차 가격 예측 데이터¶
데이터 설명 : 중고차 가격 예측 데이터(종속변수 : G3)¶
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv¶
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv¶
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv¶
y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_test.csv¶
In [114]:
#데이터 불러오기
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_test.csv')
display(x_train)
display(y_train.head())
#일단 carID컬럼은 제거해주자
X_train = x_train.drop(columns='carID')
X_test = x_test.drop(columns='carID')
y_train = y_train['price']
y_test = y_test['price']
#print(X_train.info())
#print(X_train.nunique()) #year값은 숫자로 표현되지만 범주형 데이터이기 떄문에
#라벨인코딩 해주기 위해 object로 변경하자
#print(x_train.isnull().sum()) #널값은 없다!!
X_train['year'] = X_train['year'].astype('object')
X_test['year'] = X_test['year'].astype('object')
#일단 수치형 데이터들 이상치를 대체 해주자
def solution(data, col):
q3 = data[col].quantile(0.75)
q1 = data[col].quantile(0.25)
iqr = q3-q1
upper = q3+(iqr*1.5)
lower = q1-(iqr*1.5)
data[col] = data[col].map(lambda x : data[col].mean() if x>upper or x<lower else x)
return data
for i in X_train.select_dtypes(exclude='object').columns:
solution(X_train,i)
for i in X_test.select_dtypes(exclude='object').columns:
solution(X_test,i)
#수치형 데이터 로그변환 해주자
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
if X_train[i].min() < 0:
X_train[i] = X_train[i].map(lambda x : x+abs(X_train[i].min()))
X_train[i] = np.log1p(X_train[i])
for i in X_test.select_dtypes(exclude='object').columns:
if X_test[i].min() < 0:
X_test[i] = X_test[i].map(lambda x : x+abs(X_test[i].min()))
X_test[i] = np.log1p(X_test[i])
#데이터를 분할 해주자!!
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
#수치형 데이터 스케일링 해주자
from sklearn.preprocessing import StandardScaler
obj_col = X_train.select_dtypes(include='object').columns
sds = StandardScaler()
sds.fit(X_train.drop(columns=obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns = X_train.drop(columns=obj_col).columns)
for i in obj_col:
X_train_sc[i] = X_train[i]
X_train_sc
obj_col = X_validation.select_dtypes(include='object').columns
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns=obj_col).columns)
for i in obj_col:
X_validation_sc[i] = X_validation[i]
X_validation_sc
#테스트 데이터 수치형 데이터 스케일링 해주자
obj_col = X_test.select_dtypes(include='object').columns
X_test_sc = sds.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns=X_test.drop(columns=obj_col).columns)
for i in obj_col:
X_test_sc[i] = X_test[i]
X_test_sc
#마지막으로 모델링 하기전 인코딩을 해주자, 라벨인코딩해주자
X_train_sc.select_dtypes(include='object').nunique()
#라벨인코딩 수행
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
#합쳐서 라벨인코딩후 다시 나눠줄꺼임
X_train_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])
for i in X_train_full.select_dtypes(include='object').columns:
labelencoder.fit(X_train_full[i])
X_train_sc[i] = labelencoder.transform(X_train_sc[i])
X_validation_sc[i] = labelencoder.transform(X_validation_sc[i])
X_test_sc[i] = labelencoder.transform(X_test_sc[i])
#RandomForesetRegressor로 모델링
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=42)
RFR.fit(X_train_sc, Y_train)
pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)
#예측값과 실제값을 비교해 성능평가지표를 뽑아내보자
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)
train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)
train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)
train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)
print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)
#최종적으로 test데이터 예측후 성능평가후
#예측데이터 csv로 만들어내자
#예측 데이터를 뽑아내고 성능평가지표 ㄱ ㄱ
pred_test = RFR.predict(X_test_sc)
test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)
print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)
pd.DataFrame({'carID':x_test['carID'], 'price':pred_test}).to_csv('result_car.csv', index=False)
| carID | brand | model | year | transmission | mileage | fuelType | tax | mpg | engineSize | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13207 | hyundi | Santa Fe | 2019 | Semi-Auto | 4223 | Diesel | 145.0 | 39.8 | 2.2 |
| 1 | 17314 | vauxhall | GTC | 2015 | Manual | 47870 | Diesel | 125.0 | 60.1 | 2.0 |
| 2 | 12342 | audi | RS4 | 2019 | Automatic | 5151 | Petrol | 145.0 | 29.1 | 2.9 |
| 3 | 13426 | vw | Scirocco | 2016 | Automatic | 20423 | Diesel | 30.0 | 57.6 | 2.0 |
| 4 | 16004 | skoda | Scala | 2020 | Semi-Auto | 3569 | Petrol | 145.0 | 47.1 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4955 | 16898 | merc | GL Class | 2015 | Automatic | 24314 | Diesel | 125.0 | 56.6 | 2.1 |
| 4956 | 14416 | bmw | 6 Series | 2017 | Automatic | 18000 | Diesel | 145.0 | 51.4 | 3.0 |
| 4957 | 15453 | vw | CC | 2015 | Manual | 84932 | Diesel | 30.0 | 60.1 | 2.0 |
| 4958 | 14666 | audi | A7 | 2017 | Semi-Auto | 30150 | Diesel | 145.0 | 62.8 | 3.0 |
| 4959 | 15135 | merc | SLK | 2014 | Automatic | 40230 | Diesel | 150.0 | 56.5 | 3.0 |
4960 rows × 10 columns
| carID | price | |
|---|---|---|
| 0 | 13207 | 31995 |
| 1 | 17314 | 7700 |
| 2 | 12342 | 58990 |
| 3 | 13426 | 12999 |
| 4 | 16004 | 16990 |
train_mse 2609443.213644646
train_mae 795.8580186147988
train_rmse 1615.3771118982236
train_r2_score 0.9902082831648805
validation_mse 20157983.29273163
validation_mae 2311.116256169101
validation_rmse 4489.764280308224
validation_r2_score 0.9265551683753368
test_mse 16974599.22793762
test_mae 2251.9109168765576
test_rmse 4120.024178076826
test_r2_score 0.9383820310775293
의료 비용 예측 데이터¶
데이터 설명 : 의료비용 예측문제 ( 종속변수 : charges)¶
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_train.csv¶
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_test.csv¶
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_train.csv¶
y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_test.csv¶
In [83]:
#데이터먼저 불러오자
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_test.csv')
display(x_train)
display(y_train.head())
#일단 ID컬럼은 제거해주자
X_train = x_train.drop(columns='ID')
X_test = x_test.drop(columns='ID')
y_train = y_train['charges']
y_test = y_test['charges']
#print(x_train.info())
#print(x_train.nunique())
#print(x_train.isnull().sum()) #널 값은 없다!!
#print(x_test.isnull().sum()) #널 값은 없다!!
#children은 숫자형이지만 범주형 데이터이므로 object로 변경
X_train['children'] = X_train['children'].astype('object')
X_test['children'] = X_test['children'].astype('object')
#age컬럼 범주형 데이터로 바꿔주자
def solution(x):
if 15<=x<=19:
return '10대후반'
elif 20<=x<=24:
return '20대초반'
elif 25<=x<=29:
return '20대후반'
elif 30<=x<=34:
return '30대초반'
elif 35<=x<=39:
return '30대후반'
elif 40<=x<=44:
return '40대초반'
elif 45<=x<=49:
return '40대후반'
elif 50<=x<=54:
return '50대초반'
elif 55<=x<=59:
return '50대후반'
elif 60<=x<=64:
return '60대초반'
elif 65<=x<=70:
return '60대후반'
X_train['age'] = X_train['age'].apply(solution)
X_test['age'] = X_test['age'].apply(solution)
#수치형 데이터 이상치 제거해주자!!
def solution(data,col):
q3 = data[col].quantile(0.75)
q1 = data[col].quantile(0.25)
iqr = q3-q1
upper = q3+(1.5*iqr)
lower = q3-(1.5*iqr)
data[col] = data[col].map(lambda x : data[col].mean() if (x>upper) or (x<lower) else x)
return data
for i in X_train.select_dtypes(exclude='object').columns:
solution(X_train, i)
solution(X_test, i)
#수치형 데이터 로그변환 해주자
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
if X_train[i].min() < 0:
X_train[i] = X_train[i].map(lambda x : x+abs(X_train[i].min()))
X_train[i] = np.log1p(X_train[i])
if X_test[i].min() < 0:
X_test[i] = X_test[i].map(lambda x : x+abs(X_test[i].min()))
X_test[i] = np.log1p(X_test[i])
#데이터를 분할해주자
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#스케일링 해주면 인덱스가 리셋되어 object컬럼을 다시 매핑시켜줄떄
#인덱스가 일치하지 않게됨 즉 미리 리셋을 시켜주자!!!
#X_train.reset_index(drop=True, inplace=True)
#X_validation.reset_index(drop=True, inplace=True)
#근대 수치형 데이터가 하나밖에 없어서 굳이 스케일링 해줄필요가 없을듯...
#인코딩을 해주자 범주의 수가 많은게 있으므로 age, children 라벨인코딩 해주자
#print(X_train.info())
#X_train.nunique()
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
X_full = pd.concat([X_train, X_validation, X_test])
for i in ['age','children']:
labelencoder.fit(X_full[i])
X_train[i] = labelencoder.transform(X_train[i])
X_validation[i] = labelencoder.transform(X_validation[i])
X_test[i] = labelencoder.transform(X_test[i])
#나머지 object컬럼들은 원핫인코딩을 진행
X_train = pd.get_dummies(X_train)
X_validation = pd.get_dummies(X_validation)
X_test = pd.get_dummies(X_test)
#RandomForestRegressor을 사용해 모델링 진행 ㄲ
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=40, max_depth=5)
RFR.fit(X_train, Y_train)
pred_train = RFR.predict(X_train)
pred_validation = RFR.predict(X_validation)
#실제데이터와 예측한데이터를 비교해 성능평가지표로 확인해보자
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)
train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)
train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)
train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)
print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)
#최종적으로 test데이터의 예측해 성능을 평가해보고
#예측 수행 결과를 csv로 저정해 제출 ㄱ ㄱ
pred_test = RFR.predict(X_test)
test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)
print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)
pd.DataFrame({'ID':x_test['ID'], 'charges':pred_test})
| ID | age | sex | bmi | children | smoker | region | |
|---|---|---|---|---|---|---|---|
| 0 | 2 | 35 | female | 35.860 | 2 | no | southeast |
| 1 | 3 | 28 | female | 23.845 | 2 | no | northwest |
| 2 | 4 | 23 | female | 32.780 | 2 | yes | southeast |
| 3 | 6 | 52 | female | 25.300 | 2 | yes | southeast |
| 4 | 7 | 63 | male | 39.800 | 3 | no | southwest |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1065 | 1332 | 22 | female | 30.400 | 0 | yes | northwest |
| 1066 | 1333 | 50 | male | 31.825 | 0 | yes | northeast |
| 1067 | 1335 | 18 | female | 31.130 | 0 | no | southeast |
| 1068 | 1336 | 56 | male | 26.695 | 1 | yes | northwest |
| 1069 | 1337 | 33 | female | 28.270 | 1 | no | southeast |
1070 rows × 7 columns
| ID | charges | |
|---|---|---|
| 0 | 2 | 5836.52040 |
| 1 | 3 | 4719.73655 |
| 2 | 4 | 36021.01120 |
| 3 | 6 | 24667.41900 |
| 4 | 7 | 15170.06900 |
train_mse 17290074.263040222
train_mae 2365.2665939454187
train_rmse 4158.1335071207395
train_r2_score 0.8932793742335754
validation_mse 26716431.505335
validation_mae 2850.574201752695
validation_rmse 5168.7940087930565
validation_r2_score 0.7783628022700004
test_mse 21885253.97549225
test_mae 2694.656751489809
test_rmse 4678.167801126019
test_r2_score 0.8578214388654344
Out[83]:
| ID | charges | |
|---|---|---|
| 0 | 0 | 5489.221616 |
| 1 | 1 | 2803.308217 |
| 2 | 5 | 7845.579176 |
| 3 | 21 | 4715.347076 |
| 4 | 26 | 13417.702979 |
| ... | ... | ... |
| 263 | 1327 | 12542.502713 |
| 264 | 1328 | 14033.445264 |
| 265 | 1329 | 3558.981981 |
| 266 | 1331 | 34935.412151 |
| 267 | 1334 | 13859.033927 |
268 rows × 2 columns
킹카운티 주거지 가격예측문제 데이터¶
데이터 설명 : 킹카운티 주거지 가격 예측문제 ( 종속변수 : price )¶
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_train.csv¶
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_test.csv¶
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_train.csv¶
x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_test.csv¶
In [161]:
#데이터 불러오기
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_test.csv')
x_train['date'] = x_train['date'].map(lambda x : x[:8])
x_test['date'] = x_test['date'].map(lambda x : x[:8])
x_train['data_year'] = x_train['date'].map(lambda x : x[:4])
x_train['data_month'] = x_train['date'].map(lambda x : x[4:6])
x_train['data_day'] = x_train['date'].map(lambda x : x[6:])
x_test['data_year'] = x_test['date'].map(lambda x : x[:4])
x_test['data_month'] = x_test['date'].map(lambda x : x[4:6])
x_test['data_day'] = x_test['date'].map(lambda x : x[6:])
display(x_train)
display(y_train.head())
#id, ID, date 컬럼 제거해주자
X_train = x_train.drop(columns=['ID','id','date'])
X_test = x_test.drop(columns=['ID','id','date'])
y_train = y_train['price']
y_test = y_test['price']
#print(X_train.info())
#print(X_train.nunique())
#print(x_train.isnull().sum()) #널 값은 없음
#bedrooms,bathrooms,floors,waterfront,view,condition,grade
#이 여섯개 컬럼들은 수치형 데이터이지만 범주를 의미하므로 범주형 데이터로 바꿔줌
range_data = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition', 'grade']
for i in range_data:
X_train[i] = X_train[i].astype('object')
X_test[i] = X_test[i].astype('object')
#수치형 데이터 이상치를 제거해주자
def solution(data,col):
q3 = data[col].quantile(0.75)
q1 = data[col].quantile(0.25)
iqr = q3-q1
upper = q3+(1.5*iqr)
lower = q1-(1.5*iqr)
data[col] = data[col].map(lambda x : data[col].mean() if x>upper or x<lower else x)
return data
for i in X_train.select_dtypes(exclude='object').columns:
solution(X_train,i)
solution(X_test,i)
#데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
#StandardScaler() 사용 스케일링 해주자
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
obj_col = X_train.select_dtypes(include='object').columns
sds.fit(X_train.drop(columns=obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns = X_train.drop(columns=obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns=obj_col).columns)
X_test_sc = sds.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns = X_test.drop(columns=obj_col).columns)
for i in obj_col:
X_train_sc[i] = X_train[i]
X_validation_sc[i] = X_validation[i]
X_test_sc[i] = X_test[i]
#수치형 데이터 로그 변환 해주기!!
import numpy as np
for i in X_train_sc.select_dtypes(exclude='object').columns:
if X_train_sc[i].min() < 0:
X_train_sc[i] = X_train_sc[i].map(lambda x : x+abs(X_train_sc[i].min()))
X_validation_sc[i] = X_validation_sc[i].map(lambda x : x+abs(X_validation_sc[i].min()))
X_test_sc[i] = X_test_sc[i].map(lambda x : x+abs(X_test_sc[i].min()))
X_train_sc[i] = np.log1p(X_train_sc[i])
X_validation_sc[i] = np.log1p(X_validation_sc[i])
X_test_sc[i] = np.log1p(X_test_sc[i])
#모델링 하기전에 object데이터 인코딩 해주자 범주의 개수가 많은 컬럼이 많음
#그러므로 라벨인코딩 해주자 그냥
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
X_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])
for i in X_full.select_dtypes(include='object'):
labelencoder.fit(X_full[i])
X_train_sc[i] = labelencoder.transform(X_train_sc[i])
X_validation_sc[i] = labelencoder.transform(X_validation_sc[i])
X_test_sc[i] = labelencoder.transform(X_test_sc[i])
#RandomForestRegressor을 사용해 회귀분석 실시해주자
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=40)
RFR.fit(X_train_sc, Y_train)
pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)
#실제 데이터와 예측데이터를 비교해 성능평가지표 확인
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)
train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)
train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)
train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)
print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)
pred_test = RFR.predict(X_test_sc)
test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)
print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)
pd.DataFrame({'ID':x_test['ID'], 'price':pred_test}).to_csv('house_csv', index=False)
| ID | id | date | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | data_year | data_month | data_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 8651400730 | 20150428 | 3 | 1.00 | 840 | 5525 | 1.0 | 0 | 0 | ... | 1969 | 0 | 98042 | 47.3607 | -122.085 | 920 | 5330 | 2015 | 04 | 28 |
| 1 | 3 | 3163600130 | 20150317 | 3 | 1.00 | 1250 | 8000 | 1.0 | 0 | 0 | ... | 1956 | 0 | 98146 | 47.5065 | -122.337 | 1040 | 6973 | 2015 | 03 | 17 |
| 2 | 4 | 5045700330 | 20140725 | 4 | 2.50 | 2200 | 6400 | 2.0 | 0 | 0 | ... | 2010 | 0 | 98059 | 47.4856 | -122.156 | 2600 | 5870 | 2014 | 07 | 25 |
| 3 | 5 | 1036100130 | 20140808 | 3 | 2.50 | 1980 | 39932 | 2.0 | 0 | 0 | ... | 1994 | 0 | 98011 | 47.7433 | -122.196 | 2610 | 12769 | 2014 | 08 | 08 |
| 4 | 6 | 7696630080 | 20140506 | 3 | 1.75 | 1690 | 7735 | 1.0 | 0 | 0 | ... | 1976 | 0 | 98001 | 47.3324 | -122.280 | 1580 | 7503 | 2014 | 05 | 06 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17285 | 21607 | 7420100040 | 20150422 | 3 | 1.50 | 1840 | 10956 | 1.0 | 0 | 0 | ... | 1970 | 0 | 98033 | 47.6746 | -122.164 | 1680 | 10950 | 2015 | 04 | 22 |
| 17286 | 21608 | 6372000101 | 20140820 | 3 | 2.00 | 1200 | 2016 | 1.0 | 0 | 1 | ... | 1931 | 0 | 98116 | 47.5811 | -122.404 | 1730 | 4520 | 2014 | 08 | 20 |
| 17287 | 21609 | 7212650200 | 20141027 | 3 | 2.50 | 2180 | 15484 | 1.0 | 0 | 0 | ... | 1992 | 0 | 98003 | 47.2688 | -122.309 | 2090 | 10775 | 2014 | 10 | 27 |
| 17288 | 21610 | 3832711040 | 20150424 | 5 | 2.75 | 3030 | 7000 | 1.0 | 0 | 0 | ... | 1978 | 0 | 98032 | 47.3661 | -122.280 | 1790 | 7330 | 2015 | 04 | 24 |
| 17289 | 21612 | 9526600710 | 20140724 | 4 | 2.50 | 3000 | 5639 | 2.0 | 0 | 0 | ... | 2008 | 0 | 98052 | 47.7066 | -122.115 | 3000 | 4587 | 2014 | 07 | 24 |
17290 rows × 24 columns
| ID | price | |
|---|---|---|
| 0 | 2 | 191000.0 |
| 1 | 3 | 234900.0 |
| 2 | 4 | 460000.0 |
| 3 | 5 | 442000.0 |
| 4 | 6 | 197000.0 |
train_mse 2885603782.194446
train_mae 27752.40652451657
train_rmse 53717.8162455851
train_r2_score 0.9803749755751702
validation_mse 20198174413.278122
validation_mae 76291.2798212408
validation_rmse 142120.2814987295
validation_r2_score 0.8390583266654865
test_mse 26617275691.48041
test_mae 98256.55686328938
test_rmse 163148.01773690176
test_r2_score 0.8061931224674896