import pandas as pd
# Source CSV for the type-1 tasks, read directly from GitHub.
E2_P1_URL = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e2_p1_1.csv'
df = pd.read_csv(E2_P1_URL)
# Bare expression: shows the frame when run as a notebook cell.
df


# Task 1-1: take the 10 rows with the largest CRIM, overwrite their CRIM with
# the smallest value among those 10, then average CRIM over rows with AGE >= 80.
target = df.sort_values(by='CRIM', ascending=False)

# `target.head(10)['CRIM'] = ...` writes into a temporary copy and leaves
# `target` untouched (pandas emits SettingWithCopyWarning). Assign through a
# single positional indexer so the replacement actually lands in `target`.
crim_pos = target.columns.get_loc('CRIM')
target.iloc[:10, crim_pos] = target['CRIM'].iloc[:10].min()

mean = target.loc[target['AGE'] >= 80, 'CRIM'].mean()
mean

C:\Users\Public\Documents\ESTsoft\CreatorTemp/ipykernel_18076/2323714510.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target.head(10)['CRIM'] = target.head(10)['CRIM'].min()

5.759386624999999


# Task 1-2: fill missing RM values with the RM median and report the absolute
# difference between the standard deviation before and after the fill,
# rounded to 3 decimal places.
first = target['RM'].std()  # std before imputation (NaN skipped by pandas)
second = target['RM'].fillna(target['RM'].median()).std()  # std after imputation
# The task asks for the absolute value of the difference, so do not rely on
# the sign of (first - second).
print(round(abs(first - second), 3))

0.027


# Task 1-3: treat DIS values farther than 1.5 standard deviations from the
# DIS mean as outliers and print their sum.
dis = df['DIS']
upper_bound = dis.mean() + (dis.std() * 1.5)
lower_bound = dis.mean() - (dis.std() * 1.5)

is_outlier = dis.gt(upper_bound) | dis.lt(lower_bound)
target = df.loc[is_outlier]
print(target['DIS'].sum())

404.4101


# Load the shipping datasets (features and labels for train and test).
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv')
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_test.csv')

# Drop the ID column from the feature frames; the raw frames (with ID) are
# kept under the lower-case names.
X_train = x_train.drop('ID', axis=1)
X_test = x_test.drop('ID', axis=1)

display(X_train)
display(y_train.head())

# Reduce the label frames to the single label column.
label_col = 'Reached.on.Time_Y.N'
y_train = y_train[label_col]
y_test = y_test[label_col]


#print(X_train.info())
#print(X_train.nunique())
#print(X_train.isnull().sum()) # no missing values
#print(X_test.isnull().sum()) # no missing values here either


# Replace outliers in a numeric column with the column mean.
def solution(data, col):
    """Replace IQR outliers in ``data[col]`` with the column mean.

    A value is an outlier when it lies above ``Q3 + 1.5 * IQR`` or below
    ``Q1 - 1.5 * IQR``. The replacement is the mean of the column as it was
    before any replacement (outliers included).

    Args:
        data: DataFrame holding the column; it is modified in place.
        col: Name of the numeric column to clean.

    Returns:
        The same ``data`` object, with ``data[col]`` overwritten.
    """
    values = data[col]
    q3 = values.quantile(0.75)
    q1 = values.quantile(0.25)
    iqr = q3 - q1

    upper = q3 + (1.5 * iqr)
    lower = q1 - (1.5 * iqr)

    # Compute the mean once; evaluating it inside a per-element lambda made
    # the original quadratic in the number of rows.
    replacement = values.mean()
    is_outlier = (values > upper) | (values < lower)
    data[col] = values.mask(is_outlier, replacement)
    return data

# Run the outlier replacement over every non-object column, on both the
# training and the test features.
numeric_columns = X_train.select_dtypes(exclude='object').columns
for column in numeric_columns:
    X_train = solution(X_train, column)
    X_test = solution(X_test, column)
    


# Split the training data into training and validation parts
# (33% validation, stratified on the label, fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X_train,
    y_train,
    test_size=0.33,
    stratify=y_train,
    random_state=1,
)

# Give both feature frames a fresh 0..n-1 index.
X_train = X_train.reset_index(drop=True)
X_validation = X_validation.reset_index(drop=True)


# Log-transform the numeric columns.
import numpy as np

for col in X_train.select_dtypes(exclude='object').columns:
    # log1p needs non-negative input. The original shift,
    # `x + abs(X_train[i])`, added a whole Series to every scalar, so the
    # negative-value branch could not produce a numeric column. Shift by the
    # absolute value of the (negative) training minimum instead, and use that
    # same shift on every split so all three are transformed identically.
    train_min = X_train[col].min()
    shift = abs(train_min) if train_min < 0 else 0

    for frame in (X_train, X_validation, X_test):
        # Validation/test values below the training minimum are clipped to 0
        # after shifting so log1p never receives a negative argument.
        frame[col] = np.log1p((frame[col] + shift).clip(lower=0))

    
# Standard-scale the numeric columns; the scaler is fitted on the training
# split only and then applied to validation and test.
obj_col = X_train.select_dtypes(include='object').columns
from sklearn.preprocessing import StandardScaler

sds = StandardScaler()
train_numeric = X_train.drop(columns=obj_col)
sds.fit(train_numeric)

X_train_sc = pd.DataFrame(sds.transform(train_numeric), columns=train_numeric.columns)

validation_numeric = X_validation.drop(columns=obj_col)
X_validation_sc = pd.DataFrame(sds.transform(validation_numeric), columns=validation_numeric.columns)

test_numeric = X_test.drop(columns=obj_col)
X_test_sc = pd.DataFrame(sds.transform(test_numeric), columns=test_numeric.columns)

# Re-attach the untouched object (categorical) columns to each scaled frame.
for scaled, raw in (
    (X_train_sc, X_train),
    (X_validation_sc, X_validation),
    (X_test_sc, X_test),
):
    for col in obj_col:
        scaled[col] = raw[col]
    

# One-hot encode the categorical columns. The three splits are stacked first
# so they all end up with the same dummy columns, then cut apart again by
# row position.
n_train = len(X_train_sc)
n_validation = len(X_validation_sc)

X_full = pd.get_dummies(pd.concat([X_train_sc, X_validation_sc, X_test_sc]))

X_train_sc = X_full.iloc[:n_train]
X_validation_sc = X_full.iloc[n_train:n_train + n_validation]
X_test_sc = X_full.iloc[n_train + n_validation:]


# Model selection (random forest or xgboost).
# The evaluation metric is f1_score.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

RFC = RandomForestClassifier(random_state=2)
RFC.fit(X_train_sc, Y_train)

# Score the fitted forest on the data it was trained on and on the held-out
# validation split.
pred_train = RFC.predict(X_train_sc)
f1_score_train = f1_score(Y_train, pred_train)

pred_validation = RFC.predict(X_validation_sc)
f1_score_validation = f1_score(Y_validation, pred_validation)

print('f1_score_train',f1_score_train)
print('f1_score_validation',f1_score_validation)
print('\n')

import xgboost as xgb
from sklearn.metrics import f1_score

# Keep the estimator under its own name. The original `xgb = xgb.XGBClassifier(...)`
# rebound the module alias to the estimator instance, so re-running this code
# (or any later `xgb.<something>` call) failed.
xgb_model = xgb.XGBClassifier(random_state=3)
xgb_model.fit(X_train_sc, Y_train)

pred_train = xgb_model.predict(X_train_sc)
pred_validation = xgb_model.predict(X_validation_sc)

f1_score_train = f1_score(Y_train, pred_train)
f1_score_validation = f1_score(Y_validation, pred_validation)

print('f1_score_train',f1_score_train)
print('f1_score_validation',f1_score_validation)


# The random forest scored better, so search for its best hyper-parameters.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=2)
parameters = {
    'n_estimators' : [250,300,350],
    'max_depth' : [4,5,6,7]
}
# Select on F1, the metric this task is evaluated with. Without `scoring`,
# GridSearchCV falls back to the estimator's default scorer (accuracy for
# classifiers), which can pick a different parameter set.
gridshcv = GridSearchCV(model, parameters, cv = 10, scoring = 'f1')
gridshcv.fit(X_train_sc, Y_train)
print('최적의 매개변수', gridshcv.best_params_)

f1_score_train 1.0
f1_score_validation 0.6880000000000002


f1_score_train 0.9653312788906009
f1_score_validation 0.6849651782056534
최적의 매개변수 {'max_depth': 6, 'n_estimators': 300}


# Final model: random forest with fixed hyper-parameters
# (max_depth=6, n_estimators=300).
# The evaluation metric is f1_score.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

RFC = RandomForestClassifier(n_estimators=300, max_depth=6, random_state=2)
RFC.fit(X_train_sc, Y_train)

pred_train = RFC.predict(X_train_sc)
f1_score_train = f1_score(Y_train, pred_train)

pred_validation = RFC.predict(X_validation_sc)
f1_score_validation = f1_score(Y_validation, pred_validation)

print('f1_score_train',f1_score_train)
print('f1_score_validation',f1_score_validation)
print('\n')

# Predict the test set, report its F1 against y_test, and write the
# submission file (ID plus predicted label).
pred_test = RFC.predict(X_test_sc)

f1_score_test = f1_score(y_test, pred_test)
print('f1_score_test',f1_score_test)

submission = pd.DataFrame({'ID':x_test['ID'], 'Reached.on.Time_Y.N':pred_test})
submission.to_csv('20176516.csv', index=False)

f1_score_train 0.7357696566998893
f1_score_validation 0.680184331797235


f1_score_test 0.6694045174537987

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
0	0.00632	NaN	2.31	0	0.538	6.575	65.2	4.0900	1	296.0	15.3	396.90	4.98	24.0
1	0.02731	NaN	7.07	0	0.469	6.421	78.9	4.9671	2	242.0	17.8	396.90	9.14	21.6
2	0.02729	NaN	7.07	0	0.469	7.185	61.1	4.9671	2	242.0	17.8	392.83	4.03	34.7
3	0.03237	NaN	2.18	0	0.458	6.998	45.8	6.0622	3	222.0	18.7	394.63	2.94	33.4
4	0.06905	0.0	2.18	0	0.458	NaN	54.2	6.0622	3	222.0	18.7	396.90	5.33	36.2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	0.06263	0.0	11.93	0	0.573	6.593	69.1	2.4786	1	273.0	21.0	391.99	9.67	22.4
502	0.04527	0.0	11.93	0	0.573	6.120	76.7	2.2875	1	273.0	21.0	396.90	9.08	20.6
503	0.06076	0.0	11.93	0	0.573	6.976	91.0	2.1675	1	273.0	21.0	396.90	5.64	23.9
504	0.10959	0.0	11.93	0	0.573	6.794	89.3	2.3889	1	273.0	21.0	393.45	6.48	22.0
505	0.04741	0.0	11.93	0	0.573	6.030	80.8	2.5050	1	273.0	21.0	396.90	7.88	11.9

	Warehouse_block	Mode_of_Shipment	Customer_care_calls	Customer_rating	Cost_of_the_Product	Prior_purchases	Product_importance	Gender	Discount_offered	Weight_in_gms
0	A	Flight	4	3	266	5	high	F	5	1590
1	F	Ship	3	1	174	2	low	M	44	1556
2	F	Road	4	1	154	10	high	M	10	5674
3	F	Ship	4	3	158	3	medium	F	27	1207
4	A	Flight	5	3	175	3	low	M	7	4833
...	...	...	...	...	...	...	...	...	...	...
6593	F	Road	5	2	221	6	medium	M	4	1952
6594	F	Ship	4	5	256	3	medium	M	10	4504
6595	F	Ship	3	1	217	4	medium	F	1	5761
6596	F	Road	4	5	174	3	medium	F	8	5576
6597	C	Ship	6	2	257	4	medium	M	1	1513

	ID	Reached.on.Time_Y.N
0	6045	0
1	44	1
2	7940	1
3	1596	1
4	4395	1

	ID	Reached.on.Time_Y.N
0	6811	1
1	4320	0
2	5732	0
3	7429	0
4	2191	1
...	...	...
4396	2610	1
4397	3406	0
4398	10395	0
4399	3646	0
4400	573	1

빅분기 모의고사 6회차

작업 1유형¶

주어진 Dataset에서 CRIM 값이 가장 큰 10개의 지역을 구하고,

10개 지역의 CRIM 값을 그중 가장 작은 값으로 대체하라. 그리고

AGE 컬럼 값이 80 이상인 지역의 대체된 CRIM 평균값을 구하라.

1-1에서 사용한 데이터에서 RM 중앙값으로 해당 컬럼의 결측치를 대체하라. 그리고 해당 컬럼의 결측치 대치 전후의 표준편차 차이의 절댓값을 소수점 이하 셋째 자리까지 구하라.

주어진 Dataset의 DIS 평균으로부터 1.5 * 표준편차를 벗어나는 영역을 이상치라고 판단하고, DIS 컬럼의 이상치들의 합을 구하라.

작업형 2유형¶

데이터 설명 : e-commerce 배송의 정시 도착여부 (1: 정시배송 0 : 정시미배송)¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv ¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_test.csv ¶

데이터 출처 :https://www.kaggle.com/datasets/prachi13/customer-analytics (참고, 데이터 수정)¶

x_train 데이터로 학습한 모델을 x_test에 적용하여 예측한 결과를 제출하라. 평가 지표는 f1_score이다.¶

빅분기 모의고사 6회차

작업 1유형¶

주어진 Dataset에서 CRIM 값이 가장 큰 10개의 지역을 구하고,

10개 지역의 CRIM 값을 그중 가장 작은 값으로 대체하라. 그리고

AGE 컬럼 값이 80 이상인 지역의 대체된 CRIM 평균값을 구하라.

1-1에서 사용한 데이터에서 RM 중앙값으로 해당 컬럼의 결측치를 대체하라. 그리고 해당 컬럼의 결측치 대치 전후의 표준편차 차이의 절댓값을 소수점 이하 셋째 자리까지 구하라.

주어진 Dataset의 DIS 평균으로부터 1.5 * 표준편차를 벗어나는 영역을 이상치라고 판단하고, DIS 컬럼의 이상치들의 합을 구하라.

작업형 2유형¶

데이터 설명 : e-commerce 배송의 정시 도착여부 (1: 정시배송 0 : 정시미배송)¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_test.csv¶

데이터 출처 :https://www.kaggle.com/datasets/prachi13/customer-analytics (참고, 데이터 수정)¶

x_train 데이터로 학습한 모델을 x_test에 적용하여 예측한 결과를 제출하라. 평가 지표는 f1_score이다.¶

x_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv ¶

x_test: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv ¶

y_train: https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv ¶

y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_test.csv ¶