빅데이터분석기사 준비

빅분기 2유형 문제연습(회귀-1)

세용용용용 2023. 6. 5. 15:46

학생성적 예측 데이터¶

데이터 설명 : 학생성적 예측(종속변수 : G3)¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv ¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_test.csv ¶

In [2]:

#데이터 불러오기
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_test.csv')

display(x_train)
display(y_train.head(5))

#print(x_train.info())
#print(x_train.nunique())
#print(x_train.isnull().sum()) #널 값은 없다
#나중에 스케일 age, absences, G1,G2 값을 해주자

#일단을 StudentID 컬럼을 제거해주자
X_train = x_train.drop(columns='StudentID')
X_test = x_test.drop(columns='StudentID')
y_train = y_train['G3']
y_test = y_test['G3']

#가장 우선으로 숫자형 컬럼들 이상치를 제거해주자
def solution(data, col):
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3 - q1
    
    upper = q3 + (iqr*1.5)
    lower = q1 - (iqr*1.5)
    
    data[col] = data[col].map(lambda x : data[col].mean() if x>upper or x<lower else x)
    return data
for i in X_train.select_dtypes(exclude='object').columns:
    X_train = solution(X_train,i)
for i in X_test.select_dtypes(exclude='object').columns:
    X_test = solution(X_test,i)
    

    
#로그변환을 해주자
#데이터의 분포를 변환하여 비대칭성을 완화하거나 
#오른쪽으로 치우친 분포를 정규 분포에 가깝게 만드는 데 사용됩니다.
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
    if X_train[i].min() < 0:
        X_train[i] = X_train[i].map(lambda x : x+abs(X_train[i].min()))
    X_train[i] = np.log1p(X_train[i])

for i in X_test.select_dtypes(exclude='object').columns:
    if X_test[i].min() < 0:
        X_test[i] = X_test[i].map(lambda x : x+abs(X_test[i].min()))
    X_test[i] = np.log1p(X_test[i])


#이제 데이터를 분할 해주자
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#스케일링 하게되면 인덱스가 리셋됨 그렇게되면 다시 objet컬럼이랑 매핑시켜야되는데
#인덱스가 다르므로 메핑이 안됨 그러니 스케일링 전 인덱스 리셋을 시켜주는거다
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)

#숫자형 데이터 스케일링 해주자
from sklearn.preprocessing import StandardScaler
object_columns = X_train.select_dtypes(include='object').columns

scs = StandardScaler()
scs.fit(X_train.drop(columns=object_columns))
X_train_sc = scs.transform(X_train.drop(columns=object_columns))
X_train_sc = pd.DataFrame(X_train_sc, columns=X_train.drop(columns=object_columns).columns)
for i in object_columns:
    X_train_sc[i] = X_train[i]

X_validation_sc = scs.transform(X_validation.drop(columns=object_columns))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns=object_columns).columns)
for i in object_columns:
    X_validation_sc[i] = X_validation[i]

#마지막으로 모델을 돌리기전 object컬럼 인코딩을 시켜줘야됨
X_train_sc.select_dtypes(include='object').nunique() #범주를 확인해보니 그렇게 많지않음
#원핫인코딩 레츠고
X_train_sc = pd.get_dummies(X_train_sc)
X_validation_sc = pd.get_dummies(X_validation_sc)

from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=42)
RFR.fit(X_train_sc, Y_train)

pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)


#실제 데이터와 예측한 데이터를 비교해 성능평가 ㄱ ㄱ 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)

train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)

train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)

train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)

print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)

#최종적으로 test데이터를 예측해 csv파일로 제출
obj_col = X_test.select_dtypes(include='object')
#test데이터 스케일링
X_test_sc = scs.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns=X_test.drop(columns=obj_col).columns)
for i in obj_col:
    X_test_sc[i] = X_test[i]
X_test_sc

#원핫인코딩 수행
X_test_sc = pd.get_dummies(X_test_sc)
X_test_sc

#모델 예측 수행
pred_test = RFR.predict(X_test_sc)

#실제데이터와 예측데이터 성능평가지표 ㄱ ㄱ ㄱ
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)

print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)

pd.DataFrame({'StudentID':x_test['StudentID'], 'G3':pred_test}).to_csv('result.csv', index=False)

	StudentID	school	sex	age	address	famsize	Pstatus	Medu	Fedu	Mjob	...	romantic	famrel	freetime	goout	Dalc	Walc	health	absences	G1	G2
0	1714	GP	F	18	U	GT3	T	4	3	other	...	no	4	3	3	1	1	3	0	14	13
1	1254	GP	F	17	U	GT3	T	4	3	health	...	yes	4	4	3	1	3	4	0	13	15
2	1639	GP	F	16	R	GT3	T	4	4	health	...	no	2	4	4	2	3	4	6	10	11
3	1118	GP	M	16	U	GT3	T	4	4	services	...	no	5	3	3	1	3	5	0	15	13
4	1499	GP	M	19	U	GT3	T	3	2	services	...	yes	4	5	4	1	1	4	0	5	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
673	1074	GP	M	15	U	GT3	T	4	4	services	...	no	5	3	3	1	1	5	4	10	13
674	1044	GP	M	15	R	GT3	T	4	4	other	...	yes	1	3	5	3	5	1	8	12	10
675	1078	GP	M	17	U	LE3	T	4	4	other	...	no	4	1	1	2	2	5	0	12	13
676	1055	MS	M	17	R	GT3	T	1	1	other	...	yes	4	5	5	1	3	2	0	10	9
677	1394	GP	F	17	U	GT3	T	3	2	health	...	no	5	2	2	1	2	5	0	18	18

678 rows × 33 columns

	StudentID	G3
0	1714	14
1	1254	15
2	1639	11
3	1118	13
4	1499	0

train_mse 0.38765121445178663
train_mae 0.3789489720998532
train_rmse 0.6226164264230318
train_r2_score 0.9758410777214178


validation_mse 1.9441621305803574
validation_mae 0.8704918154761906
validation_rmse 1.3943321450000201
validation_r2_score 0.8361182581646641


test_mse 3.1833930596539166
test_mae 1.0744653916211293
test_rmse 1.7842065630565078
test_r2_score 0.7929832649002089

중고차 가격 예측 데이터¶

데이터 설명 : 중고차 가격 예측 데이터(종속변수 : G3)¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv ¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_test.csv ¶

In [114]:

#데이터 불러오기
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_test.csv')

display(x_train)
display(y_train.head())


#일단 carID컬럼은 제거해주자
X_train = x_train.drop(columns='carID')
X_test = x_test.drop(columns='carID')
y_train = y_train['price']
y_test = y_test['price']


#print(X_train.info())
#print(X_train.nunique()) #year값은 숫자로 표현되지만 범주형 데이터이기 떄문에
#라벨인코딩 해주기 위해 object로 변경하자
#print(x_train.isnull().sum()) #널값은 없다!!
X_train['year'] = X_train['year'].astype('object')
X_test['year'] = X_test['year'].astype('object')


#일단 수치형 데이터들 이상치를 대체 해주자
def solution(data, col):
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3-q1
    
    upper = q3+(iqr*1.5)
    lower = q1-(iqr*1.5)
    
    data[col] = data[col].map(lambda x : data[col].mean() if x>upper or x<lower else x)
    return data

for i in X_train.select_dtypes(exclude='object').columns:
    solution(X_train,i)
for i in X_test.select_dtypes(exclude='object').columns:
    solution(X_test,i)


#수치형 데이터 로그변환 해주자
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
    if X_train[i].min() < 0:
        X_train[i] = X_train[i].map(lambda x : x+abs(X_train[i].min()))
    X_train[i] = np.log1p(X_train[i])
    
for i in X_test.select_dtypes(exclude='object').columns:
    if X_test[i].min() < 0:
        X_test[i] = X_test[i].map(lambda x : x+abs(X_test[i].min()))
    X_test[i] = np.log1p(X_test[i])
    
    
#데이터를 분할 해주자!!
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)


#수치형 데이터 스케일링 해주자
from sklearn.preprocessing import StandardScaler

obj_col = X_train.select_dtypes(include='object').columns
sds = StandardScaler()
sds.fit(X_train.drop(columns=obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns = X_train.drop(columns=obj_col).columns)
for i in obj_col:
    X_train_sc[i] = X_train[i]
X_train_sc

obj_col = X_validation.select_dtypes(include='object').columns
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns=obj_col).columns)
for i in obj_col:
    X_validation_sc[i] = X_validation[i]
X_validation_sc

#테스트 데이터 수치형 데이터 스케일링 해주자
obj_col = X_test.select_dtypes(include='object').columns
X_test_sc = sds.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns=X_test.drop(columns=obj_col).columns)
for i in obj_col:
    X_test_sc[i] = X_test[i]
X_test_sc


#마지막으로 모델링 하기전 인코딩을 해주자, 라벨인코딩해주자
X_train_sc.select_dtypes(include='object').nunique()

#라벨인코딩 수행
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

#합쳐서 라벨인코딩후 다시 나눠줄꺼임
X_train_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])

for i in X_train_full.select_dtypes(include='object').columns:
    labelencoder.fit(X_train_full[i])  
    X_train_sc[i] = labelencoder.transform(X_train_sc[i])
    X_validation_sc[i] = labelencoder.transform(X_validation_sc[i])
    X_test_sc[i] = labelencoder.transform(X_test_sc[i])


#RandomForesetRegressor로 모델링
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=42)
RFR.fit(X_train_sc, Y_train)

pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)


#예측값과 실제값을 비교해 성능평가지표를 뽑아내보자
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)

train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)

train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)

train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)

print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)

#최종적으로 test데이터 예측후 성능평가후
#예측데이터 csv로 만들어내자
#예측 데이터를 뽑아내고 성능평가지표 ㄱ ㄱ
pred_test = RFR.predict(X_test_sc)

test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)
print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)

pd.DataFrame({'carID':x_test['carID'], 'price':pred_test}).to_csv('result_car.csv', index=False)

	carID	brand	model	year	transmission	mileage	fuelType	tax	mpg	engineSize
0	13207	hyundi	Santa Fe	2019	Semi-Auto	4223	Diesel	145.0	39.8	2.2
1	17314	vauxhall	GTC	2015	Manual	47870	Diesel	125.0	60.1	2.0
2	12342	audi	RS4	2019	Automatic	5151	Petrol	145.0	29.1	2.9
3	13426	vw	Scirocco	2016	Automatic	20423	Diesel	30.0	57.6	2.0
4	16004	skoda	Scala	2020	Semi-Auto	3569	Petrol	145.0	47.1	1.0
...	...	...	...	...	...	...	...	...	...	...
4955	16898	merc	GL Class	2015	Automatic	24314	Diesel	125.0	56.6	2.1
4956	14416	bmw	6 Series	2017	Automatic	18000	Diesel	145.0	51.4	3.0
4957	15453	vw	CC	2015	Manual	84932	Diesel	30.0	60.1	2.0
4958	14666	audi	A7	2017	Semi-Auto	30150	Diesel	145.0	62.8	3.0
4959	15135	merc	SLK	2014	Automatic	40230	Diesel	150.0	56.5	3.0

4960 rows × 10 columns

	carID	price
0	13207	31995
1	17314	7700
2	12342	58990
3	13426	12999
4	16004	16990

train_mse 2609443.213644646
train_mae 795.8580186147988
train_rmse 1615.3771118982236
train_r2_score 0.9902082831648805


validation_mse 20157983.29273163
validation_mae 2311.116256169101
validation_rmse 4489.764280308224
validation_r2_score 0.9265551683753368


test_mse 16974599.22793762
test_mae 2251.9109168765576
test_rmse 4120.024178076826
test_r2_score 0.9383820310775293

의료 비용 예측 데이터¶

데이터 설명 : 의료비용 예측문제 ( 종속변수 : charges)¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_train.csv ¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_test.csv ¶

In [83]:

#데이터먼저 불러오자
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_test.csv')

display(x_train)
display(y_train.head())


#일단 ID컬럼은 제거해주자 
X_train = x_train.drop(columns='ID')
X_test = x_test.drop(columns='ID')
y_train = y_train['charges']
y_test = y_test['charges']


#print(x_train.info())
#print(x_train.nunique()) 
#print(x_train.isnull().sum()) #널 값은 없다!!
#print(x_test.isnull().sum()) #널 값은 없다!!

#children은 숫자형이지만 범주형 데이터이므로 object로 변경
X_train['children'] = X_train['children'].astype('object')
X_test['children'] = X_test['children'].astype('object')

#age컬럼 범주형 데이터로 바꿔주자

def solution(x):
    if 15<=x<=19:
        return '10대후반'
    elif 20<=x<=24:
        return '20대초반'
    elif 25<=x<=29:
        return '20대후반'
    elif 30<=x<=34:
        return '30대초반'
    elif 35<=x<=39:
        return '30대후반'
    elif 40<=x<=44:
        return '40대초반'
    elif 45<=x<=49:
        return '40대후반'
    elif 50<=x<=54:
        return '50대초반'
    elif 55<=x<=59:
        return '50대후반'
    elif 60<=x<=64:
        return '60대초반'
    elif 65<=x<=70:
        return '60대후반'
X_train['age'] = X_train['age'].apply(solution)
X_test['age'] = X_test['age'].apply(solution)


#수치형 데이터 이상치 제거해주자!!
def solution(data,col):
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3-q1
    
    upper = q3+(1.5*iqr)
    lower = q3-(1.5*iqr)
    
    data[col] = data[col].map(lambda x : data[col].mean() if (x>upper) or (x<lower) else x)
    return data

for i in X_train.select_dtypes(exclude='object').columns:
    solution(X_train, i)
    solution(X_test, i)
    
#수치형 데이터 로그변환 해주자
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
    if X_train[i].min() < 0:
        X_train[i] = X_train[i].map(lambda x : x+abs(X_train[i].min()))
    X_train[i] = np.log1p(X_train[i])
    
    if X_test[i].min() < 0:
        X_test[i] = X_test[i].map(lambda x : x+abs(X_test[i].min()))
    X_test[i] = np.log1p(X_test[i])


#데이터를 분할해주자
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#스케일링 해주면 인덱스가 리셋되어 object컬럼을 다시 매핑시켜줄떄
#인덱스가 일치하지 않게됨 즉 미리 리셋을 시켜주자!!!
#X_train.reset_index(drop=True, inplace=True)
#X_validation.reset_index(drop=True, inplace=True)


#근대 수치형 데이터가 하나밖에 없어서 굳이 스케일링 해줄필요가 없을듯...


#인코딩을 해주자 범주의 수가 많은게 있으므로 age, children 라벨인코딩 해주자
#print(X_train.info())
#X_train.nunique()
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
X_full = pd.concat([X_train, X_validation, X_test])

for i in ['age','children']:
    labelencoder.fit(X_full[i])
    X_train[i] = labelencoder.transform(X_train[i])
    X_validation[i] = labelencoder.transform(X_validation[i])
    X_test[i] = labelencoder.transform(X_test[i])



#나머지 object컬럼들은 원핫인코딩을 진행 
X_train = pd.get_dummies(X_train)
X_validation = pd.get_dummies(X_validation)
X_test = pd.get_dummies(X_test)


#RandomForestRegressor을 사용해 모델링 진행 ㄲ
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=40, max_depth=5)
RFR.fit(X_train, Y_train)

pred_train = RFR.predict(X_train)
pred_validation = RFR.predict(X_validation)


#실제데이터와 예측한데이터를 비교해 성능평가지표로 확인해보자
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)

train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)

train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)

train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)

print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)


#최종적으로 test데이터의 예측해 성능을 평가해보고
#예측 수행 결과를 csv로 저정해 제출 ㄱ ㄱ
pred_test = RFR.predict(X_test)

test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)

print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)

pd.DataFrame({'ID':x_test['ID'], 'charges':pred_test})

	ID	age	sex	bmi	children	smoker	region
0	2	35	female	35.860	2	no	southeast
1	3	28	female	23.845	2	no	northwest
2	4	23	female	32.780	2	yes	southeast
3	6	52	female	25.300	2	yes	southeast
4	7	63	male	39.800	3	no	southwest
...	...	...	...	...	...	...	...
1065	1332	22	female	30.400	0	yes	northwest
1066	1333	50	male	31.825	0	yes	northeast
1067	1335	18	female	31.130	0	no	southeast
1068	1336	56	male	26.695	1	yes	northwest
1069	1337	33	female	28.270	1	no	southeast

1070 rows × 7 columns

	ID	charges
0	2	5836.52040
1	3	4719.73655
2	4	36021.01120
3	6	24667.41900
4	7	15170.06900

train_mse 17290074.263040222
train_mae 2365.2665939454187
train_rmse 4158.1335071207395
train_r2_score 0.8932793742335754


validation_mse 26716431.505335
validation_mae 2850.574201752695
validation_rmse 5168.7940087930565
validation_r2_score 0.7783628022700004


test_mse 21885253.97549225
test_mae 2694.656751489809
test_rmse 4678.167801126019
test_r2_score 0.8578214388654344

Out[83]:

	ID	charges
0	0	5489.221616
1	1	2803.308217
2	5	7845.579176
3	21	4715.347076
4	26	13417.702979
...	...	...
263	1327	12542.502713
264	1328	14033.445264
265	1329	3558.981981
266	1331	34935.412151
267	1334	13859.033927

268 rows × 2 columns

킹카운티 주거지 가격예측문제 데이터¶

데이터 설명 : 킹카운티 주거지 가격 예측문제 ( 종속변수 : price )¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_train.csv ¶

x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_test.csv ¶

In [161]:

#데이터 불러오기
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_test.csv')

x_train['date'] = x_train['date'].map(lambda x : x[:8])
x_test['date'] = x_test['date'].map(lambda x : x[:8])

x_train['data_year'] =  x_train['date'].map(lambda x : x[:4])
x_train['data_month'] =  x_train['date'].map(lambda x : x[4:6])
x_train['data_day'] =  x_train['date'].map(lambda x : x[6:])

x_test['data_year'] =  x_test['date'].map(lambda x : x[:4])
x_test['data_month'] =  x_test['date'].map(lambda x : x[4:6])
x_test['data_day'] =  x_test['date'].map(lambda x : x[6:])

display(x_train)
display(y_train.head())



#id, ID, date 컬럼 제거해주자
X_train = x_train.drop(columns=['ID','id','date'])
X_test = x_test.drop(columns=['ID','id','date'])
y_train = y_train['price']
y_test = y_test['price']


#print(X_train.info())
#print(X_train.nunique())
#print(x_train.isnull().sum()) #널 값은 없음



#bedrooms,bathrooms,floors,waterfront,view,condition,grade
#이 여섯개 컬럼들은 수치형 데이터이지만 범주를 의미하므로 범주형 데이터로 바꿔줌
range_data = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition', 'grade']
for i in range_data:
    X_train[i] = X_train[i].astype('object')
    X_test[i] = X_test[i].astype('object')


    
#수치형 데이터 이상치를 제거해주자
def solution(data,col):
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3-q1
    
    upper = q3+(1.5*iqr)
    lower = q1-(1.5*iqr)
    data[col] = data[col].map(lambda x : data[col].mean() if x>upper or x<lower else x)
    return data

for i in X_train.select_dtypes(exclude='object').columns:
    solution(X_train,i)
    solution(X_test,i)
    

#데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
#print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)


#StandardScaler() 사용 스케일링 해주자
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
obj_col = X_train.select_dtypes(include='object').columns
sds.fit(X_train.drop(columns=obj_col))

X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns = X_train.drop(columns=obj_col).columns)

X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns=obj_col).columns)

X_test_sc = sds.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns = X_test.drop(columns=obj_col).columns)

for i in obj_col:
    X_train_sc[i] = X_train[i]
    X_validation_sc[i] = X_validation[i]
    X_test_sc[i] = X_test[i]
    
    
#수치형 데이터 로그 변환 해주기!!
import numpy as np
for i in X_train_sc.select_dtypes(exclude='object').columns:
    if X_train_sc[i].min() < 0:
        X_train_sc[i] = X_train_sc[i].map(lambda x : x+abs(X_train_sc[i].min()))
        X_validation_sc[i] = X_validation_sc[i].map(lambda x : x+abs(X_validation_sc[i].min()))
        X_test_sc[i] = X_test_sc[i].map(lambda x : x+abs(X_test_sc[i].min()))
    X_train_sc[i] = np.log1p(X_train_sc[i])
    X_validation_sc[i] = np.log1p(X_validation_sc[i])
    X_test_sc[i] = np.log1p(X_test_sc[i])

    
#모델링 하기전에 object데이터 인코딩 해주자 범주의 개수가 많은 컬럼이 많음
#그러므로 라벨인코딩 해주자 그냥
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

X_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])
for i in X_full.select_dtypes(include='object'):
    labelencoder.fit(X_full[i])
    X_train_sc[i] = labelencoder.transform(X_train_sc[i])
    X_validation_sc[i] = labelencoder.transform(X_validation_sc[i])
    X_test_sc[i] = labelencoder.transform(X_test_sc[i])

    
#RandomForestRegressor을 사용해 회귀분석 실시해주자
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=40)
RFR.fit(X_train_sc, Y_train)

pred_train = RFR.predict(X_train_sc)
pred_validation = RFR.predict(X_validation_sc)


#실제 데이터와 예측데이터를 비교해 성능평가지표 확인
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

train_mse = mean_squared_error(Y_train, pred_train)
validation_mse = mean_squared_error(Y_validation, pred_validation)

train_mae = mean_absolute_error(Y_train, pred_train)
validation_mae = mean_absolute_error(Y_validation, pred_validation)

train_rmse = np.sqrt(train_mse)
validation_rmse = np.sqrt(validation_mse)

train_r2_score = r2_score(Y_train, pred_train)
validation_r2_score = r2_score(Y_validation, pred_validation)

print('train_mse',train_mse)
print('train_mae',train_mae)
print('train_rmse',train_rmse)
print('train_r2_score',train_r2_score)
print('\n')
print('validation_mse',validation_mse)
print('validation_mae',validation_mae)
print('validation_rmse',validation_rmse)
print('validation_r2_score',validation_r2_score)

pred_test = RFR.predict(X_test_sc)
test_mse = mean_squared_error(y_test, pred_test)
test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(test_mse)
test_r2_score = r2_score(y_test, pred_test)

print('\n')
print('test_mse',test_mse)
print('test_mae',test_mae)
print('test_rmse',test_rmse)
print('test_r2_score',test_r2_score)


pd.DataFrame({'ID':x_test['ID'], 'price':pred_test}).to_csv('house_csv', index=False)

	ID	id	date	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	...	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15	data_year	data_month	data_day
0	2	8651400730	20150428	3	1.00	840	5525	1.0	0	0	...	1969	0	98042	47.3607	-122.085	920	5330	2015	04	28
1	3	3163600130	20150317	3	1.00	1250	8000	1.0	0	0	...	1956	0	98146	47.5065	-122.337	1040	6973	2015	03	17
2	4	5045700330	20140725	4	2.50	2200	6400	2.0	0	0	...	2010	0	98059	47.4856	-122.156	2600	5870	2014	07	25
3	5	1036100130	20140808	3	2.50	1980	39932	2.0	0	0	...	1994	0	98011	47.7433	-122.196	2610	12769	2014	08	08
4	6	7696630080	20140506	3	1.75	1690	7735	1.0	0	0	...	1976	0	98001	47.3324	-122.280	1580	7503	2014	05	06
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
17285	21607	7420100040	20150422	3	1.50	1840	10956	1.0	0	0	...	1970	0	98033	47.6746	-122.164	1680	10950	2015	04	22
17286	21608	6372000101	20140820	3	2.00	1200	2016	1.0	0	1	...	1931	0	98116	47.5811	-122.404	1730	4520	2014	08	20
17287	21609	7212650200	20141027	3	2.50	2180	15484	1.0	0	0	...	1992	0	98003	47.2688	-122.309	2090	10775	2014	10	27
17288	21610	3832711040	20150424	5	2.75	3030	7000	1.0	0	0	...	1978	0	98032	47.3661	-122.280	1790	7330	2015	04	24
17289	21612	9526600710	20140724	4	2.50	3000	5639	2.0	0	0	...	2008	0	98052	47.7066	-122.115	3000	4587	2014	07	24

17290 rows × 24 columns

	ID	price
0	2	191000.0
1	3	234900.0
2	4	460000.0
3	5	442000.0
4	6	197000.0

train_mse 2885603782.194446
train_mae 27752.40652451657
train_rmse 53717.8162455851
train_r2_score 0.9803749755751702


validation_mse 20198174413.278122
validation_mae 76291.2798212408
validation_rmse 142120.2814987295
validation_r2_score 0.8390583266654865


test_mse 26617275691.48041
test_mae 98256.55686328938
test_rmse 163148.01773690176
test_r2_score 0.8061931224674896

빅분기 2유형 문제연습(회귀-1)

학생성적 예측 데이터¶

데이터 설명 : 학생성적 예측(종속변수 : G3)¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_test.csv¶

중고차 가격 예측 데이터¶

데이터 설명 : 중고차 가격 예측 데이터(종속변수 : G3)¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_test.csv¶

의료 비용 예측 데이터¶

데이터 설명 : 의료비용 예측문제 ( 종속변수 : charges)¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_train.csv¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_test.csv¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_train.csv¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_test.csv¶

킹카운티 주거지 가격예측문제 데이터¶

데이터 설명 : 킹카운티 주거지 가격 예측문제 ( 종속변수 : price )¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_train.csv¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_test.csv¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_train.csv¶

x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_test.csv¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv ¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_test.csv ¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv ¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_test.csv ¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/x_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_train.csv ¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost/y_test.csv ¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/x_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_train.csv ¶

x_label(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/kingcountyprice/y_test.csv ¶