In [12]:
# Load the data
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p1_1.csv')
df.head()
Out[12]:
| | Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 05-02-2010 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 |
| 1 | 1 | 12-02-2010 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 |
| 2 | 1 | 19-02-2010 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 |
| 3 | 1 | 26-02-2010 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 |
| 4 | 1 | 05-03-2010 | 1554806.68 | 0 | 46.5 | 2.625 | 211.350143 | 8.106 |
Remove the non-numeric characters from the Temperature column, convert it to a numeric type, and compute the difference between the third and first quartiles (the IQR) to two decimal places
In [13]:
# Strip the stray '*' characters, then cast to float
df['Temperature'] = df['Temperature'].str.replace('*', '', regex=False)
df['Temperature'] = df['Temperature'].astype('float')
q3 = df['Temperature'].quantile(0.75)
q1 = df['Temperature'].quantile(0.25)
iqr = q3 - q1
print(round(iqr, 2))
27.48
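The replace above only strips the '*' characters that happen to appear in this particular file. As a more defensive variant (just a sketch, assuming the stray characters can be anything other than digits, the decimal point, and a minus sign), a regex can clean the column in one pass:

```python
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p1_1.csv')
# Drop everything except digits, '.' and '-' before casting to float
df['Temperature'] = (df['Temperature']
                     .astype(str)
                     .str.replace(r'[^0-9.\-]', '', regex=True)
                     .astype(float))
print(round(df['Temperature'].quantile(0.75) - df['Temperature'].quantile(0.25), 2))
```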
In [15]:
# Load the data
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p1_2.csv')
df.head()
Out[15]:
| | Title | Video ID | Published At | Keyword | Likes | Comments | Views |
|---|---|---|---|---|---|---|---|
| 0 | Apple Pay Is Killing the Physical Wallet After... | wAZZ-UWGVHI | 2022-08-23 | tech | 3407.0 | 672.0 | 135612.0 |
| 1 | The most EXPENSIVE thing I own. | b3x28s61q3c | 2022-08-24 | tech | 76779.0 | 4306.0 | 1758063.0 |
| 2 | My New House Gaming Setup is SICK! | 4mgePWWCAmA | 2022-08-23 | tech | 63825.0 | 3338.0 | 1564007.0 |
| 3 | Petrol Vs Liquid Nitrogen | Freezing Experimen... | kXiYSI7H2b0 | 2022-08-23 | tech | 71566.0 | 1426.0 | 922918.0 |
| 4 | Best Back to School Tech 2022! | ErMwWXQxHp0 | 2022-08-08 | tech | 96513.0 | 5155.0 | 1855644.0 |
For videos whose Likes-to-Comments ratio is at least 20 and whose Keyword is 'minecraft', compute the mean of their Views as an integer
In [19]:
# Filter on the Likes/Comments ratio and the keyword, then average Views
target = df.loc[((df['Likes'] / df['Comments']) >= 20) & (df['Keyword'] == 'minecraft')]
print(int(target['Views'].mean()))
1789084
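One thing to be aware of: if Comments contains zeros or missing values, the ratio becomes inf or NaN. NaN comparisons evaluate to False, so those rows silently fall out of the mask, while inf does pass the >= 20 test. A sketch that makes the same filter explicit:

```python
ratio = df['Likes'] / df['Comments']                 # 0 in Comments -> inf; NaN stays NaN
mask = ratio.ge(20) & df['Keyword'].eq('minecraft')  # NaN >= 20 evaluates to False
print(int(df.loc[mask, 'Views'].mean()))
```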
In [44]:
# Load the data
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p1_3.csv')
df.head()
Out[44]:
| | show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
Count the entries whose date_added falls in January 2018 and whose country is 'United Kingdom' alone (sole production country)
In [53]:
# Drop rows with missing date_added, then match the year 2018 and the month January
df.dropna(subset=['date_added'], axis=0, inplace=True)
target = df.loc[df['date_added'].str.contains('2018') & df['date_added'].str.contains('January')]
target = target.loc[target['country'] == 'United Kingdom']
print(len(target))
6
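String matching works here, but parsing date_added into real datetimes is more robust. A sketch, assuming the raw values look like 'September 24, 2021' (rows that fail to parse become NaT and drop out of the mask):

```python
dates = pd.to_datetime(df['date_added'].str.strip(), format='%B %d, %Y', errors='coerce')
mask = (dates.dt.year == 2018) & (dates.dt.month == 1) & (df['country'] == 'United Kingdom')
print(mask.sum())
```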
Task Type 2
User segmentation: https://www.kaggle.com/datasets/kaushiksuresh147/customer-segmentation
train = https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p2_train.csv
test = https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p2_test.csv
Target variable: Segmentation. For test.csv, predict the Segmentation class for each ID, save the result, and submit it. The submission must contain exactly two columns, ID and Segmentation. The evaluation metric is macro F1 score.
In [70]:
# Load the data
import pandas as pd
x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p2_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p2_test.csv')
# Drop the ID column
X_train = x_train.drop(columns=['ID', 'Segmentation'])
X_test = x_test.drop(columns='ID')
y_train = x_train[['ID', 'Segmentation']].copy()  # .copy() prevents a SettingWithCopyWarning below
# Encode the target classes A-D as 0-3
def target(x):
    if x == 'A':
        return 0
    elif x == 'B':
        return 1
    elif x == 'C':
        return 2
    elif x == 'D':
        return 3
y_train['Segmentation'] = y_train['Segmentation'].apply(target)
display(X_train)
display(y_train.head())
y_train = y_train['Segmentation']
#print(X_train.info())
#print(X_train.nunique())
# Fill missing values: the mean for numeric columns,
# the most frequent category for categorical columns
fillna_Ever_Married = X_train['Ever_Married'].value_counts().index[0]
fillna_Graduated = X_train['Graduated'].value_counts().index[0]
fillna_Profession = X_train['Profession'].value_counts().index[0]
fillna_Work_Experience = X_train['Work_Experience'].mean()
fillna_Family_Size = X_train['Family_Size'].value_counts().index[0]
fillna_Var_1 = X_train['Var_1'].value_counts().index[0]
fillna_dic = {
    'Ever_Married': fillna_Ever_Married,
    'Graduated': fillna_Graduated,
    'Profession': fillna_Profession,
    'Work_Experience': fillna_Work_Experience,
    'Family_Size': fillna_Family_Size,
    'Var_1': fillna_Var_1
}
for i in fillna_dic.keys():
    X_train[i] = X_train[i].fillna(fillna_dic[i])
    X_test[i] = X_test[i].fillna(fillna_dic[i])
# Treat Family_Size as a categorical column
X_train['Family_Size'] = X_train['Family_Size'].astype('str')
X_test['Family_Size'] = X_test['Family_Size'].astype('str')
#print(X_train.isnull().sum())
#print(X_test.isnull().sum())
# Replace outliers in the numeric columns with the column mean (1.5 * IQR rule)
def replace_outliers_with_mean(data, col):
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3 - q1
    upper = q3 + (1.5 * iqr)
    lower = q1 - (1.5 * iqr)
    mean = data[col].mean()
    data[col] = data[col].map(lambda x: mean if (x > upper) or (x < lower) else x)
    return data
for i in X_train.select_dtypes(exclude='object').columns:
    X_train = replace_outliers_with_mean(X_train, i)
    X_test = replace_outliers_with_mean(X_test, i)
# Split off a validation set
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=43)
print(X_train.shape, X_validation.shape, Y_train.shape, Y_validation.shape)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
# Log-transform the numeric columns (shift up first if a column has negative values)
import numpy as np
for i in X_train.select_dtypes(exclude='object').columns:
    if X_train[i].min() < 0:
        X_train[i] = X_train[i] + abs(X_train[i].min())
    X_train[i] = np.log1p(X_train[i])
    if X_validation[i].min() < 0:
        X_validation[i] = X_validation[i] + abs(X_validation[i].min())
    X_validation[i] = np.log1p(X_validation[i])
    if X_test[i].min() < 0:
        X_test[i] = X_test[i] + abs(X_test[i].min())
    X_test[i] = np.log1p(X_test[i])
# Scale the numeric columns with StandardScaler (fit on the training split only)
obj_col = X_train.select_dtypes(include='object').columns
from sklearn.preprocessing import StandardScaler
sds = StandardScaler()
sds.fit(X_train.drop(columns=obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns=X_train.drop(columns=obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns=X_validation.drop(columns=obj_col).columns)
X_test_sc = sds.transform(X_test.drop(columns=obj_col))
X_test_sc = pd.DataFrame(X_test_sc, columns=X_test.drop(columns=obj_col).columns)
# Reattach the categorical columns
for i in obj_col:
    X_train_sc[i] = X_train[i]
    X_validation_sc[i] = X_validation[i]
    X_test_sc[i] = X_test[i]
# One-hot encode the categoricals; concatenating first guarantees
# that train/validation/test end up with identical dummy columns
#display(X_test_sc)
X_full = pd.concat([X_train_sc, X_validation_sc, X_test_sc])
X_full = pd.get_dummies(X_full)
X_train_sc = X_full[:len(X_train_sc)]
X_validation_sc = X_full[len(X_train_sc):len(X_train_sc)+len(X_validation)]
X_test_sc = X_full[len(X_train_sc)+len(X_validation):]
#display(X_test_sc)
# Compare two models: random forest vs XGBoost
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
RFC = RandomForestClassifier(random_state=10)
RFC.fit(X_train_sc, Y_train)
xgb_model = xgb.XGBClassifier(random_state=11)  # don't shadow the xgb module
xgb_model.fit(X_train_sc, Y_train)
pred_xg_train = xgb_model.predict(X_train_sc)
pred_xg_validation = xgb_model.predict(X_validation_sc)
pred_rf_train = RFC.predict(X_train_sc)
pred_rf_validation = RFC.predict(X_validation_sc)
# Evaluate with macro F1
from sklearn.metrics import f1_score
f1_score_rf_train = f1_score(Y_train, pred_rf_train, average='macro')
f1_score_rf_validation = f1_score(Y_validation, pred_rf_validation, average='macro')
print('f1_score_rf_train', f1_score_rf_train)
print('f1_score_rf_validation', f1_score_rf_validation)
f1_score_xg_train = f1_score(Y_train, pred_xg_train, average='macro')
f1_score_xg_validation = f1_score(Y_validation, pred_xg_validation, average='macro')
print('f1_score_xg_train', f1_score_xg_train)
print('f1_score_xg_validation', f1_score_xg_validation)
| | Gender | Ever_Married | Age | Graduated | Profession | Work_Experience | Spending_Score | Family_Size | Var_1 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | No | 40 | Yes | Artist | 7.0 | Low | 1.0 | Cat_6 |
| 1 | Male | No | 18 | No | Healthcare | NaN | Low | 5.0 | Cat_4 |
| 2 | Male | No | 25 | No | Healthcare | 7.0 | Low | 4.0 | Cat_4 |
| 3 | Female | Yes | 46 | Yes | Artist | 2.0 | Low | 1.0 | Cat_6 |
| 4 | Male | No | 27 | Yes | Healthcare | 1.0 | Low | 3.0 | Cat_6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4876 | Male | Yes | 84 | Yes | Lawyer | NaN | Low | 2.0 | Cat_2 |
| 4877 | Male | No | 84 | No | Artist | 0.0 | Low | 1.0 | Cat_6 |
| 4878 | Female | Yes | 40 | No | Engineer | 0.0 | Low | 3.0 | Cat_4 |
| 4879 | Male | Yes | 36 | Yes | Entertainment | 9.0 | Low | 4.0 | Cat_6 |
| 4880 | Male | Yes | 52 | No | Executive | 1.0 | Low | 5.0 | NaN |
4881 rows × 9 columns
| | ID | Segmentation |
|---|---|---|
| 0 | 464357 | 0 |
| 1 | 459624 | 3 |
| 2 | 462672 | 3 |
| 3 | 463360 | 1 |
| 4 | 462420 | 2 |
(3270, 9) (1611, 9) (3270,) (1611,)
f1_score_rf_train 0.9732843697357679
f1_score_rf_validation 0.40362107509255996
f1_score_xg_train 0.8371497021628412
f1_score_xg_validation 0.4310435915379764
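The fill/scale/encode steps above have to be kept in sync across train, validation, and test by hand. A more compact alternative (a sketch, not the exam answer) pushes the same preprocessing into a ColumnTransformer, so a single fitted object applies identical transforms everywhere:

```python
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

num_cols = X_train.select_dtypes(exclude='object').columns
cat_cols = X_train.select_dtypes(include='object').columns

preprocess = ColumnTransformer([
    # numeric columns: mean imputation, then standard scaling
    ('num', Pipeline([('impute', SimpleImputer(strategy='mean')),
                      ('scale', StandardScaler())]), num_cols),
    # categorical columns: mode imputation, then one-hot encoding;
    # handle_unknown='ignore' covers categories missing from the training split
    ('cat', Pipeline([('impute', SimpleImputer(strategy='most_frequent')),
                      ('onehot', OneHotEncoder(handle_unknown='ignore'))]), cat_cols),
])

clf = Pipeline([('prep', preprocess),
                ('model', RandomForestClassifier(random_state=10))])
clf.fit(X_train, Y_train)                    # preprocessing is fit on train only
pred_validation = clf.predict(X_validation)  # same transforms, no leakage
```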
In [84]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
model = xgb.XGBClassifier(random_state=11)
parameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [1, 2, 3]
}
gridshcv = GridSearchCV(model, parameters, cv=3)
gridshcv.fit(X_train_sc, Y_train)
print('Best parameters:', gridshcv.best_params_)
Best parameters: {'max_depth': 2, 'n_estimators': 100}
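Note that GridSearchCV optimizes accuracy by default. Since this problem is scored with macro F1, it is probably better to search with that metric directly (a small variant of the cell above):

```python
gridshcv = GridSearchCV(model, parameters, cv=3, scoring='f1_macro')
gridshcv.fit(X_train_sc, Y_train)
print('Best parameters:', gridshcv.best_params_)
```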
In [93]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=11, max_depth=2, n_estimators=100)
xgb_model.fit(X_train_sc, Y_train)
pred_train = xgb_model.predict(X_train_sc)
pred_validation = xgb_model.predict(X_validation_sc)
pred_test = xgb_model.predict(X_test_sc)
from sklearn.metrics import f1_score
f1_score_train = f1_score(Y_train, pred_train, average='macro')
f1_score_validation = f1_score(Y_validation, pred_validation, average='macro')
print('f1_score_train', f1_score_train)
print('f1_score_validation', f1_score_validation)
#pd.DataFrame({'ID':x_test['ID'], 'Segmentation':pred_test}).to_csv('201716551', index=False)
pd.DataFrame({'ID': x_test['ID'], 'Segmentation': pred_test})
f1_score_train 0.5210729096324587
f1_score_validation 0.4674311325163576
Out[93]:
| | ID | Segmentation |
|---|---|---|
| 0 | 460406 | 3 |
| 1 | 466890 | 2 |
| 2 | 466145 | 0 |
| 3 | 465805 | 1 |
| 4 | 466137 | 3 |
| ... | ... | ... |
| 1537 | 463734 | 0 |
| 1538 | 466055 | 1 |
| 1539 | 465689 | 3 |
| 1540 | 460401 | 3 |
| 1541 | 465215 | 1 |
1542 rows × 2 columns
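The to_csv line above is commented out and would save the label-encoded predictions as-is. Since the target was mapped from A-D to 0-3 before training, the predictions presumably need mapping back before submission (a sketch; the output filename is a placeholder):

```python
inverse = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}        # undo the earlier label encoding
submission = pd.DataFrame({'ID': x_test['ID'],
                           'Segmentation': pd.Series(pred_test).map(inverse)})
submission.to_csv('submission.csv', index=False)  # hypothetical filename
```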
Task Type 3
Data on the department and gender of incoming students at a university. DataUrl = https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p3_1_.csv
Based on this data, we run a chi-square test of independence to check whether students' department and gender are independent of each other.
In [94]:
# Load the data
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e4_p3_1_.csv')
df.head()
Out[94]:
| | 학번 | 학과 | 성별 |
|---|---|---|---|
| 0 | DFSKJK_1 | 경영학과 | 여 |
| 1 | DFSKJK_2 | 사회학과 | 여 |
| 2 | DFSKJK_3 | 기계공학과 | 여 |
| 3 | DFSKJK_4 | 경제학과 | 남 |
| 4 | DFSKJK_5 | 기계공학과 | 여 |
Compute the average number of students per department to three decimal places
In [98]:
# Head count per department, then the mean across departments
target = df.groupby('학과').size()
print(round(target.mean(), 3))
170.333
Compute the chi-square test of independence statistic to three decimal places
In [110]:
from scipy.stats import chi2_contingency
# Contingency table: department x gender
crosstab = pd.crosstab(df['학과'], df['성별'])
chi2, p, dof, expected = chi2_contingency(crosstab)
print(round(chi2, 3))
5.646
Compute the p-value of the chi-square independence test to three decimal places. At the 0.05 significance level, state which of the null and alternative hypotheses is supported
In [114]:
print(round(p, 3))
print('The p-value is greater than 0.05, so we fail to reject the null hypothesis')
0.342
The p-value is greater than 0.05, so we fail to reject the null hypothesis
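The conclusion can also be drawn programmatically instead of hard-coding the message (a small sketch using the p value unpacked in the previous cell):

```python
alpha = 0.05
if p < alpha:
    print('Reject the null hypothesis: department and gender are not independent')
else:
    print('Fail to reject the null hypothesis: no evidence of dependence')
```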