빅데이터분석기사 준비
빅분기 모의고사 6회차
세용용용용
2023. 6. 22. 21:22
작업 1유형¶
In [25]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e2_p1_1.csv')
df
Out[25]:
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | NaN | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | NaN | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | NaN | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | NaN | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | NaN | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 0.06263 | 0.0 | 11.93 | 0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273.0 | 21.0 | 391.99 | 9.67 | 22.4 |
| 502 | 0.04527 | 0.0 | 11.93 | 0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273.0 | 21.0 | 396.90 | 9.08 | 20.6 |
| 503 | 0.06076 | 0.0 | 11.93 | 0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273.0 | 21.0 | 396.90 | 5.64 | 23.9 |
| 504 | 0.10959 | 0.0 | 11.93 | 0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273.0 | 21.0 | 393.45 | 6.48 | 22.0 |
| 505 | 0.04741 | 0.0 | 11.93 | 0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273.0 | 21.0 | 396.90 | 7.88 | 11.9 |
506 rows × 14 columns
In [26]:
target = df.sort_values(by='CRIM', ascending=False)
target.head(10)['CRIM'] = target.head(10)['CRIM'].min()
mean = target.loc[target['AGE']>=80]['CRIM'].mean()
mean
C:\Users\Public\Documents\ESTsoft\CreatorTemp/ipykernel_18076/2323714510.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy target.head(10)['CRIM'] = target.head(10)['CRIM'].min()
Out[26]:
5.759386624999999
1-1에서 사용한 데이터에서 RM 중앙값으로 해당 컬럼의 결측치를 대체하라 그리고 해당 컬럼의 결측치 대치 전후의 표준편차 차이의 절댓값을 소숫점 이하 3째자리 까지 구하라¶
In [30]:
#target['RM'].fillna(target['RM'].median())
first = target['RM'].std()
second = target['RM'].fillna(target['RM'].median()).std()
print(round((first - second),3))
0.027
주어진 Dataset의 DIS 평균으로 부터 1.5 * 표준편차를 벗어나는 영역을 이상치라고 판단하고 DIS 컬럼의 이상치들의 합을 구하여라.¶
In [38]:
eiei_upper = df['DIS'].mean() + (df['DIS'].std()*1.5)
eiei_lowewr = df['DIS'].mean() - (df['DIS'].std()*1.5)
target = df.loc[(df['DIS']>eiei_upper) | (df['DIS']<eiei_lowewr)]
print(target['DIS'].sum())
404.4101
작업형 2유형¶
데이터 설명 : e-commerce 배송의 정시 도착여부 (1: 정시배송 0 : 정시미배송)¶
x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv¶
x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv¶
y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv¶
y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_test.csv¶
데이터 출처 :https://www.kaggle.com/datasets/prachi13/customer-analytics (참고, 데이터 수정)¶
x_train 데이터로 학습한 모델을 x_test에 적용하여 예측한 결과를 제출하라. 평가 지표는 f1_score이다.¶
In [92]:
#데이터 불러오기
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_test.csv')
#ID컬럼 삭제
X_train = x_train.drop(columns = 'ID')
X_test = x_test.drop(columns = 'ID')
display(X_train)
display(y_train.head())
y_train = y_train['Reached.on.Time_Y.N']
y_test = y_test['Reached.on.Time_Y.N']
#print(X_train.info())
#print(X_train.nunique())
#print(X_train.isnull().sum()) # 널 값은 없다
#print(X_test.isnull().sum()) # 널 값은 없다 개꿀개꿀
#수치형 데이터 이상치 평균으로 대체
def solution(data, col):
q3 = data[col].quantile(0.75)
q1 = data[col].quantile(0.25)
iqr = q3 - q1
upper = q3 + (1.5*iqr)
lower = q1 - (1.5*iqr)
data[col] = data[col].map(lambda x : data[col].mean() if (x>upper) or (x<lower) else x)
return data
for i in X_train.select_dtypes(exclude='object').columns:
X_train = solution(X_train, i)
X_test = solution(X_test, i)
#데이터 분할을 해주자
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, stratify=y_train, random_state=1)
#print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
#수치형 컬럼 로그변환을 해주자
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
if X_train[i].min()<0:
X_train[i] = X_train[i].map(lambda x : x+abs(X_train[i]))
X_train[i] = np.log1p(X_train[i])
if X_validation[i].min()<0:
X_validation[i] = X_validation[i].map(lambda x : x+abs(X_validation[i]))
X_validation[i] = np.log1p(X_validation[i])
if X_test[i].min()<0:
X_test[i] = X_test[i].map(lambda x : x+abs(X_test[i]))
X_test[i] = np.log1p(X_test[i])
#수치형 컬럼 StandardScaler 스케일링 해주기
obj_col = X_train.select_dtypes(include='object').columns
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
sds.fit(X_train.drop(columns = obj_col))
X_train_sc = sds.transform(X_train.drop(columns = obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns = X_train.drop(columns = obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns = obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns = obj_col).columns)
X_test_sc = sds.transform(X_test.drop(columns = obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns = X_test.drop(columns = obj_col).columns)
for i in obj_col:
X_train_sc[i] = X_train[i]
X_validation_sc[i] = X_validation[i]
X_test_sc[i] = X_test[i]
#원핫 인코딩 해주자
#display(X_test_sc)
X_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])
X_full = pd.get_dummies(X_full)
X_train_sc = X_full[:len(X_train_sc)]
X_validation_sc = X_full[len(X_train_sc):len(X_train_sc)+len(X_validation_sc)]
X_test_sc = X_full[len(X_train_sc)+len(X_validation_sc):]
#display(X_test_sc)
#모델선정 (랜포 아니면 xgboost)
#평가 지표는 f1_score
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=2)
RFC.fit(X_train_sc, Y_train)
pred_train = RFC.predict(X_train_sc)
pred_validation = RFC.predict(X_validation_sc)
from sklearn.metrics import f1_score
f1_score_train = f1_score(Y_train, pred_train)
f1_score_validation = f1_score(Y_validation, pred_validation)
print('f1_score_train',f1_score_train)
print('f1_score_validation',f1_score_validation)
print('\n')
import xgboost as xgb
xgb = xgb.XGBClassifier(random_state=3)
xgb.fit(X_train_sc, Y_train)
pred_train = xgb.predict(X_train_sc)
pred_validation = xgb.predict(X_validation_sc)
from sklearn.metrics import f1_score
f1_score_train = f1_score(Y_train, pred_train)
f1_score_validation = f1_score(Y_validation, pred_validation)
print('f1_score_train',f1_score_train)
print('f1_score_validation',f1_score_validation)
#랜포가 더 잘나왔으므로 랜포로 최적의 매개변수를 찾아보자
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier(random_state=2)
parameters = {
'n_estimators' : [250,300,350],
'max_depth' : [4,5,6,7]
}
gridshcv = GridSearchCV(model, parameters, cv = 10)
gridshcv.fit(X_train_sc, Y_train)
print('최적의 매개변수', gridshcv.best_params_)
| Warehouse_block | Mode_of_Shipment | Customer_care_calls | Customer_rating | Cost_of_the_Product | Prior_purchases | Product_importance | Gender | Discount_offered | Weight_in_gms | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A | Flight | 4 | 3 | 266 | 5 | high | F | 5 | 1590 |
| 1 | F | Ship | 3 | 1 | 174 | 2 | low | M | 44 | 1556 |
| 2 | F | Road | 4 | 1 | 154 | 10 | high | M | 10 | 5674 |
| 3 | F | Ship | 4 | 3 | 158 | 3 | medium | F | 27 | 1207 |
| 4 | A | Flight | 5 | 3 | 175 | 3 | low | M | 7 | 4833 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6593 | F | Road | 5 | 2 | 221 | 6 | medium | M | 4 | 1952 |
| 6594 | F | Ship | 4 | 5 | 256 | 3 | medium | M | 10 | 4504 |
| 6595 | F | Ship | 3 | 1 | 217 | 4 | medium | F | 1 | 5761 |
| 6596 | F | Road | 4 | 5 | 174 | 3 | medium | F | 8 | 5576 |
| 6597 | C | Ship | 6 | 2 | 257 | 4 | medium | M | 1 | 1513 |
6598 rows × 10 columns
| ID | Reached.on.Time_Y.N | |
|---|---|---|
| 0 | 6045 | 0 |
| 1 | 44 | 1 |
| 2 | 7940 | 1 |
| 3 | 1596 | 1 |
| 4 | 4395 | 1 |
f1_score_train 1.0
f1_score_validation 0.6880000000000002
f1_score_train 0.9653312788906009
f1_score_validation 0.6849651782056534
최적의 매개변수 {'max_depth': 6, 'n_estimators': 300}
In [97]:
#모델선정 (랜포 아니면 xgboost)
#평가 지표는 f1_score
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=2, max_depth=6, n_estimators=300)
RFC.fit(X_train_sc, Y_train)
pred_train = RFC.predict(X_train_sc)
pred_validation = RFC.predict(X_validation_sc)
from sklearn.metrics import f1_score
f1_score_train = f1_score(Y_train, pred_train)
f1_score_validation = f1_score(Y_validation, pred_validation)
print('f1_score_train',f1_score_train)
print('f1_score_validation',f1_score_validation)
print('\n')
#최종적으로 test데이터 예측한 결과를 제출하라 , 평가지표는 f1_score
pred_test = RFC.predict(X_test_sc)
f1_score_test = f1_score(y_test, pred_test)
print('f1_score_test',f1_score_test)
pd.DataFrame({'ID':x_test['ID'], 'Reached.on.Time_Y.N':pred_test}).to_csv('20176516.csv', index=False)
f1_score_train 0.7357696566998893 f1_score_validation 0.680184331797235 f1_score_test 0.6694045174537987
Out[97]:
| ID | Reached.on.Time_Y.N | |
|---|---|---|
| 0 | 6811 | 1 |
| 1 | 4320 | 0 |
| 2 | 5732 | 0 |
| 3 | 7429 | 0 |
| 4 | 2191 | 1 |
| ... | ... | ... |
| 4396 | 2610 | 1 |
| 4397 | 3406 | 0 |
| 4398 | 10395 | 0 |
| 4399 | 3646 | 0 |
| 4400 | 573 | 1 |
4401 rows × 2 columns