Big Data Analytics Engineer exam prep
Big Data Analytics Engineer mock exam, round 2
세용용용용
2023. 6. 16. 19:46
Task Type 1
In [47]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/stroke_/train.csv')
df.head()
Out[47]:
| | id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1192 | Female | 31 | 0 | 0 | No | Govt_job | Rural | 70.66 | 27.2 | never smoked | 0 |
| 1 | 77 | Female | 13 | 0 | 0 | No | children | Rural | 85.81 | 18.6 | Unknown | 0 |
| 2 | 59200 | Male | 18 | 0 | 0 | No | Private | Urban | 60.56 | 33.0 | never smoked | 0 |
| 3 | 24905 | Female | 65 | 0 | 0 | Yes | Private | Urban | 205.77 | 46.0 | formerly smoked | 1 |
| 4 | 24257 | Male | 4 | 0 | 0 | No | children | Rural | 90.42 | 16.2 | Unknown | 0 |
1. What is the mean age of patients whose gender is Male?
In [45]:
def solution(x):
    # some age values are strings containing stray non-digit characters,
    # so keep only the digits and convert to int
    number = ''
    for i in x:
        if i.isdigit():
            number += i
    return int(number)

target = df.loc[df['gender']=='Male'].copy()   # .copy() avoids the SettingWithCopyWarning
target['age'] = target['age'].apply(solution)
target['age'].mean()
Out[45]:
44.68623481781376
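The per-character loop can also be replaced with vectorized string methods; a minimal sketch, assuming (as the loop above does) that age is stored as strings with stray non-digit characters:
male_age = (
    df.loc[df['gender'] == 'Male', 'age']
      .str.replace(r'\D', '', regex=True)   # drop every non-digit character
      .astype(int)
)
print(male_age.mean())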
2. Fill the missing values in the bmi column with the median of the non-missing bmi values, then compute the mean of the bmi column to 3 decimal places
In [27]:
target = df['bmi'].fillna(df.loc[~df['bmi'].isnull()]['bmi'].median()).mean()
print(round(target,3))
29.166
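Since Series.median() already skips NaN by default, the explicit mask is not strictly needed; a minimal equivalent sketch:
# .median() ignores NaN on its own, so this matches the cell above
print(round(df['bmi'].fillna(df['bmi'].median()).mean(), 3))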
3. Fill each missing bmi value with the bmi value of the previous row, then compute the mean of the bmi column to 3 decimal places
In [30]:
target = df['bmi'].fillna(method='ffill').mean()
print(round(target,3))
29.188
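In recent pandas releases fillna(method='ffill') is deprecated; the dedicated ffill() method does the same forward fill. A minimal sketch:
# forward-fill each missing bmi from the previous row
print(round(df['bmi'].ffill().mean(), 3))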
4. Replace each missing bmi value with the mean bmi of that patient's age group (10-year bins), then compute the mean of the imputed bmi column to 3 decimal places
In [48]:
def solution1(x):
    if 0<=x<=9:
        return 0
    elif 10<=x<=19:
        return 10
    elif 20<=x<=29:
        return 20
    elif 30<=x<=39:
        return 30
    elif 40<=x<=49:
        return 40
    elif 50<=x<=59:
        return 50
    elif 60<=x<=69:
        return 60
    elif 70<=x<=79:
        return 70
    elif 80<=x<=89:
        return 80

df['age'] = df['age'].apply(solution)        # strip the non-digit characters first
df['age_range'] = df['age'].apply(solution1)
# fill each missing bmi with the mean bmi of that age group, then report the mean
df['bmi'] = df['bmi'].fillna(df.groupby('age_range')['bmi'].transform('mean'))
print(round(df['bmi'].mean(), 3))
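The if/elif ladder can be replaced by integer division when only 10-year bins are needed; a minimal sketch, assuming age has already been converted to int as in the cell above:
# same 10-year bins via integer division
age_bin = (df['age'] // 10) * 10
print((age_bin == df['age_range']).all())   # should print True while every age falls in 0-89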
5. Change every avg_glucose_level value of 200 or more to 199, then compute the mean avg_glucose_level of rows where stroke is 1, to 3 decimal places
In [54]:
df['avg_glucose_level'] = df['avg_glucose_level'].map(lambda x : 199 if x>=200 else x)
target = df.loc[df['stroke']==1]
print(round(target['avg_glucose_level'].mean(),3))
125.188
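The lambda can be avoided with Series.where, which keeps values where the condition holds and replaces the rest; a minimal sketch of the same capping:
# keep values below 200, replace everything else with 199
capped = df['avg_glucose_level'].where(df['avg_glucose_level'] < 200, 199)
print(round(capped[df['stroke'] == 1].mean(), 3))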
In [55]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/pok/Pokemon.csv')
df
Out[55]:
| | # | Name | Type 1 | Type 2 | Total | HP | Attack | Defense | Sp. Atk | Sp. Def | Speed | Generation | Legendary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Bulbasaur | Grass | Poison | 318 | 45 | 49 | 49 | 65 | 65 | 45 | 1 | False |
| 1 | 2 | Ivysaur | Grass | Poison | 405 | 60 | 62 | 63 | 80 | 80 | 60 | 1 | False |
| 2 | 3 | Venusaur | Grass | Poison | 525 | 80 | 82 | 83 | 100 | 100 | 80 | 1 | False |
| 3 | 3 | VenusaurMega Venusaur | Grass | Poison | 625 | 80 | 100 | 123 | 122 | 120 | 80 | 1 | False |
| 4 | 4 | Charmander | Fire | NaN | 309 | 39 | 52 | 43 | 60 | 50 | 65 | 1 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 795 | 719 | Diancie | Rock | Fairy | 600 | 50 | 100 | 150 | 100 | 150 | 50 | 6 | True |
| 796 | 719 | DiancieMega Diancie | Rock | Fairy | 700 | 50 | 160 | 110 | 160 | 110 | 110 | 6 | True |
| 797 | 720 | HoopaHoopa Confined | Psychic | Ghost | 600 | 80 | 110 | 60 | 150 | 130 | 70 | 6 | True |
| 798 | 720 | HoopaHoopa Unbound | Psychic | Dark | 680 | 80 | 160 | 60 | 170 | 130 | 80 | 6 | True |
| 799 | 721 | Volcanion | Fire | Water | 600 | 80 | 110 | 120 | 130 | 90 | 70 | 6 | True |
800 rows × 13 columns
When sorted in descending order by the Attack column, what is the difference in the number of Legendary Pokémon (Legendary column) between the top 400 and those ranked 401~800?
In [146]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/pok/Pokemon.csv')
df = df.sort_values(by='Attack', ascending = False)
len_400 = df[:400]
# value_counts() sorts by frequency, so .iloc[1] picks the minority value (True) in each half
a = len_400['Legendary'].value_counts().iloc[1]
len_401 = df[400:]
b = len_401['Legendary'].value_counts().iloc[1]
print(a-b)
57
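Because Legendary is a boolean column, .sum() counts the True values directly and does not depend on True being the minority value; a minimal sketch:
top_half = df[:400]       # df is already sorted by Attack, descending
bottom_half = df[400:]
print(top_half['Legendary'].sum() - bottom_half['Legendary'].sum())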
When the mean of the Total column per Type 1 is sorted in descending order, which Type 1 is third from the top?
In [150]:
target = df.groupby('Type 1').Total.mean().sort_values(ascending = False).index[2]
target
Out[150]:
'Flying'
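nlargest gives the same top-3 ranking in a single step; a minimal sketch:
# third-highest mean Total per Type 1
print(df.groupby('Type 1')['Total'].mean().nlargest(3).index[-1])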
In [157]:
result = df.dropna()[:int(len(df.dropna()) *0.6)]
result
Out[157]:
| | # | Name | Type 1 | Type 2 | Total | HP | Attack | Defense | Sp. Atk | Sp. Def | Speed | Generation | Legendary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 163 | 150 | MewtwoMega Mewtwo X | Psychic | Fighting | 780 | 106 | 190 | 100 | 154 | 100 | 130 | 1 | True |
| 232 | 214 | HeracrossMega Heracross | Bug | Fighting | 600 | 80 | 185 | 115 | 40 | 105 | 75 | 2 | False |
| 424 | 383 | GroudonPrimal Groudon | Ground | Fire | 770 | 100 | 180 | 160 | 150 | 90 | 90 | 3 | True |
| 426 | 384 | RayquazaMega Rayquaza | Dragon | Flying | 780 | 105 | 180 | 100 | 180 | 100 | 115 | 3 | True |
| 711 | 646 | KyuremBlack Kyurem | Dragon | Ice | 700 | 125 | 170 | 100 | 120 | 90 | 95 | 5 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 714 | 647 | KeldeoResolute Forme | Water | Fighting | 580 | 91 | 72 | 90 | 129 | 90 | 108 | 5 | False |
| 94 | 87 | Dewgong | Water | Ice | 475 | 90 | 70 | 80 | 70 | 95 | 70 | 1 | False |
| 330 | 304 | Aron | Steel | Rock | 330 | 50 | 70 | 100 | 40 | 40 | 30 | 3 | False |
| 513 | 462 | Magnezone | Electric | Steel | 535 | 70 | 70 | 115 | 130 | 90 | 60 | 4 | False |
| 45 | 40 | Wigglytuff | Normal | Fairy | 435 | 140 | 70 | 45 | 85 | 50 | 45 | 1 | False |
248 rows × 13 columns
After dropping every row with a missing value, take the first 60% of the data in order and compute the first quartile of the Defense column
In [167]:
target = df.dropna()[:int(len(df.dropna())*0.6)].Defense.quantile(.25)
target
Out[167]:
70.0
Count the Water-type Pokémon whose Attack is at least the mean Attack of Fire-type (Type 1) Pokémon
In [186]:
target = df.loc[df['Attack']>=df.loc[df['Type 1']=='Fire']['Attack'].mean()]
target = target.loc[target['Type 1']=='Water']
print(len(target))
37
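The same count can also be written as a single boolean mask; a minimal sketch:
# combine both conditions and count the matching rows
fire_mean = df.loc[df['Type 1'] == 'Fire', 'Attack'].mean()
print(((df['Type 1'] == 'Water') & (df['Attack'] >= fire_mean)).sum())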
Which generation (Generation column) has the largest absolute difference between the mean Speed and mean Defense columns?
In [199]:
# mean Speed and mean Defense per generation, then the absolute difference between them
target = df.groupby('Generation')[['Speed','Defense']].mean().diff(axis=1)['Defense'].abs().sort_values()
target.index[-1]
Out[199]:
2
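With idxmax the same comparison reads a little more directly; a minimal sketch over the same grouped means:
# absolute gap between mean Defense and mean Speed per generation
gen_means = df.groupby('Generation')[['Speed', 'Defense']].mean()
print((gen_means['Defense'] - gen_means['Speed']).abs().idxmax())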
Task Type 2
Data source: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset (post-processed)
Data description: predict whether a patient had a stroke
train : https://raw.githubusercontent.com/Datamanim/datarepo/main/stroke_/train.csv
test : https://raw.githubusercontent.com/Datamanim/datarepo/main/stroke_/test.csv
In [85]:
# load the data
import pandas as pd
train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/stroke_/train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/stroke_/test.csv')
x_train = train.drop(columns = 'stroke')
x_test = test
y_train = train[['id','stroke']]
# drop the id column
X_train = x_train.drop(columns = 'id')
X_test = x_test.drop(columns = 'id')
display(X_train)
display(y_train.head())
y_train = y_train['stroke']
# inspect the data
#print(X_train.info())
#print(X_train.nunique())
#print(X_train.isnull().sum())
#print(X_test.isnull().sum())
# the bmi column has missing values!
# impute them with the mean
X_train['bmi'].fillna(X_train['bmi'].mean(), inplace=True)
X_test['bmi'].fillna(X_train['bmi'].mean(), inplace=True)
# drop the 'Other' gender rows from train
X_train = X_train.loc[X_train['gender']!='Other']
y_train = y_train.loc[y_train.index.isin(X_train.index)]
# bin the age column into categories
X_train['age'] = X_train['age'].map(lambda x : x.replace('*',''))
X_train['age'] = X_train['age'].astype('int')
#print(X_train['age'].min())
#print(X_train['age'].max())
def solution(x):
    if 0<=x<=9:
        return '0대'
    elif 10<=x<=19:
        return '10대'
    elif 20<=x<=29:
        return '20대'
    elif 30<=x<=39:
        return '30대'
    elif 40<=x<=49:
        return '40대'
    elif 50<=x<=59:
        return '50대'
    elif 60<=x<=69:
        return '60대'
    elif 70<=x<=79:
        return '70대'
    elif 80<=x<=89:
        return '80대'
X_train['age'] = X_train['age'].apply(solution)
X_test['age'] = X_test['age'].apply(solution)
# numeric-looking columns that are really categorical: cast them to object
for i in ['hypertension', 'heart_disease']:
    X_train[i] = X_train[i].astype('object')
    X_test[i] = X_test[i].astype('object')
# replace outliers (outside 1.5 * IQR) with the column mean
def eieiei(data, col):
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3 - q1
    upper = q3 + (1.5*iqr)
    lower = q1 - (1.5*iqr)
    mean_value = data[col].mean()   # mean of the original column
    data[col] = data[col].map(lambda x : mean_value if (x>upper)or(x<lower) else x)
    return data

for i in X_train.select_dtypes(exclude='object').columns:
    X_train = eieiei(X_train,i)
    X_test = eieiei(X_test,i)
# split into train and validation sets
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43, stratify=y_train)
#print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace = True)
X_validation.reset_index(drop=True, inplace = True)
# log-transform the numeric columns (shift a column up first if it has negative values)
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
    if X_train[i].min() < 0:
        X_train[i] = X_train[i].map(lambda x : x + abs(X_train[i].min()))
    X_train[i] = np.log1p(X_train[i])
    if X_validation[i].min() < 0:
        X_validation[i] = X_validation[i].map(lambda x : x + abs(X_validation[i].min()))
    X_validation[i] = np.log1p(X_validation[i])
    if X_test[i].min() < 0:
        X_test[i] = X_test[i].map(lambda x : x + abs(X_test[i].min()))
    X_test[i] = np.log1p(X_test[i])
# scale the numeric columns with StandardScaler
obj_col = X_train.select_dtypes(include = 'object').columns
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
sds.fit(X_train.drop(columns = obj_col))
X_train_sc = sds.transform(X_train.drop(columns = obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns = X_train.drop(columns = obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns = obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns = X_validation.drop(columns = obj_col).columns)
X_test_sc = sds.transform(X_test.drop(columns = obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns = X_test.drop(columns = obj_col).columns)
# attach the categorical columns back onto the scaled numeric frames
for i in obj_col:
    X_train_sc[i] = X_train[i]
    X_validation_sc[i] = X_validation[i]
    X_test_sc[i] = X_test[i]
# one-hot encode the categorical columns
#display(X_test_sc)
X_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])
X_full = pd.get_dummies(X_full)
X_train_sc = X_full[:len(X_train_sc)]
X_validation_sc = X_full[len(X_train_sc):len(X_train_sc)+len(X_validation_sc)]
X_test_sc = X_full[len(X_train_sc)+len(X_validation_sc):]
#display(X_test_sc)
# grid-search for the best XGBoost hyperparameters
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
model = xgb.XGBClassifier(random_state=11)
#help(xgb.XGBClassifier)
parameters = {
    'n_estimators' : [20,40,60],
    'max_depth' : [1,2,3]
}
grid_search = GridSearchCV(model, parameters, cv=3)
grid_search.fit(X_train_sc, Y_train)
# check the results
print("Best parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
# XGBoost scores higher, so go with it
# train the final model with the best parameters (a new name avoids shadowing the xgb module)
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=11, max_depth=1, n_estimators=40)
xgb_model.fit(X_train_sc, Y_train)
pred_train = xgb_model.predict(X_train_sc)
pred_validation = xgb_model.predict(X_validation_sc)
pred_train_proba0 = xgb_model.predict_proba(X_train_sc)[:,0]
pred_train_proba1 = xgb_model.predict_proba(X_train_sc)[:,1]
pred_validation_proba0 = xgb_model.predict_proba(X_validation_sc)[:,0]
pred_validation_proba1 = xgb_model.predict_proba(X_validation_sc)[:,1]
# evaluate performance on train and validation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
acc_train = accuracy_score(Y_train, pred_train)
acc_validation = accuracy_score(Y_validation, pred_validation)
pre_train = precision_score(Y_train, pred_train)
pre_validation = precision_score(Y_validation, pred_validation)
recall_train = recall_score(Y_train, pred_train)
recall_validation = recall_score(Y_validation, pred_validation)
f1_train = f1_score(Y_train, pred_train)
f1_validation = f1_score(Y_validation, pred_validation)
roc_auc_train = roc_auc_score(Y_train, pred_train_proba1)
roc_auc_validation = roc_auc_score(Y_validation, pred_validation_proba1)
print('acc_train',acc_train)
print('pre_train',pre_train)
print('recall_train',recall_train)
print('f1_train',f1_train)
print('roc_auc_train',roc_auc_train)
print('\n')
print('acc_validation',acc_validation)
print('pre_validation',pre_validation)
print('recall_validation',recall_validation)
print('f1_validation',f1_validation)
print('roc_auc_validation',roc_auc_validation)
# finally, predict on the test data; the predictions are what go into the submission csv
pred_test = xgb_model.predict(X_test_sc)
pred_test_proba0 = xgb_model.predict_proba(X_test_sc)[:,0]
pred_test_proba1 = xgb_model.predict_proba(X_test_sc)[:,1]
pd.DataFrame({'id':x_test['id'], 'stroke':pred_test, 'proba0':pred_test_proba0, 'proba1': pred_test_proba1})
| | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 31 | 0 | 0 | No | Govt_job | Rural | 70.66 | 27.2 | never smoked |
| 1 | Female | 13 | 0 | 0 | No | children | Rural | 85.81 | 18.6 | Unknown |
| 2 | Male | 18 | 0 | 0 | No | Private | Urban | 60.56 | 33.0 | never smoked |
| 3 | Female | 65 | 0 | 0 | Yes | Private | Urban | 205.77 | 46.0 | formerly smoked |
| 4 | Male | 4 | 0 | 0 | No | children | Rural | 90.42 | 16.2 | Unknown |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1132 | Female | 1 | 0 | 0 | No | children | Rural | 110.17 | 20.3 | Unknown |
| 1133 | Male | 6 | 0 | 0 | No | children | Rural | 92.98 | 18.9 | Unknown |
| 1134 | Male | 81 | 0 | 0 | Yes | Private | Urban | 234.35 | 25.3 | formerly smoked |
| 1135 | Female | 81 | 1 | 0 | Yes | Private | Rural | 80.43 | 29.7 | never smoked |
| 1136 | Female | 57 | 1 | 0 | Yes | Private | Urban | 108.61 | 38.1 | smokes |
1137 rows × 10 columns
| | id | stroke |
|---|---|---|
| 0 | 1192 | 0 |
| 1 | 77 | 0 |
| 2 | 59200 | 0 |
| 3 | 24905 | 1 |
| 4 | 24257 | 0 |
Best parameters: {'max_depth': 1, 'n_estimators': 40}
Best Score: 0.877786768748768
acc_train 0.8764783180026281
pre_train 0.4166666666666667
recall_train 0.05434782608695652
f1_train 0.09615384615384615
roc_auc_train 0.8715555338922466
acc_validation 0.8773333333333333
pre_validation 0.4
recall_validation 0.044444444444444446
f1_validation 0.07999999999999999
roc_auc_validation 0.7968350168350168
Out[85]:
| | id | stroke | proba0 | proba1 |
|---|---|---|---|---|
| 0 | 47472 | 0 | 0.844078 | 0.155922 |
| 1 | 36841 | 0 | 0.846470 | 0.153530 |
| 2 | 3135 | 0 | 0.827112 | 0.172888 |
| 3 | 65218 | 0 | 0.965063 | 0.034937 |
| 4 | 1847 | 0 | 0.963504 | 0.036496 |
| ... | ... | ... | ... | ... |
| 927 | 40311 | 0 | 0.936998 | 0.063002 |
| 928 | 44281 | 0 | 0.978329 | 0.021671 |
| 929 | 11238 | 0 | 0.936146 | 0.063854 |
| 930 | 17308 | 0 | 0.623784 | 0.376216 |
| 931 | 24782 | 0 | 0.966213 | 0.033787 |
932 rows × 4 columns
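The cell above only displays the prediction frame; in the actual exam the final step is writing the result to a csv file. A minimal sketch, where the file name 'result.csv' and the submitted columns are assumptions rather than something fixed by this post:
# write the submission file (file name and required columns follow the exam instructions)
submission = pd.DataFrame({'id': x_test['id'], 'stroke': pred_test})
submission.to_csv('result.csv', index=False)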