빅데이터분석기사 Exam Prep
빅분기 Chapter 6 (Logistic Regression)
세용용용용
2023. 5. 9. 12:06
Code Implementation¶
In [17]:
import pandas as pd
import numpy as np
import warnings
# Suppress all warning messages
warnings.filterwarnings('ignore')
body = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/bodyPerformance.csv')
body
Out[17]:
| | age | gender | height_cm | weight_kg | body fat_% | diastolic | systolic | gripForce | sit and bend forward_cm | sit-ups counts | broad jump_cm | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27.0 | M | 172.3 | 75.24 | 21.3 | 80.0 | 130.0 | 54.9 | 18.4 | 60.0 | 217.0 | C |
| 1 | 25.0 | M | 165.0 | 55.80 | 15.7 | 77.0 | 126.0 | 36.4 | 16.3 | 53.0 | 229.0 | A |
| 2 | 31.0 | M | 179.6 | 78.00 | 20.1 | 92.0 | 152.0 | 44.8 | 12.0 | 49.0 | 181.0 | C |
| 3 | 32.0 | M | 174.5 | 71.10 | 18.4 | 76.0 | 147.0 | 41.4 | 15.2 | 53.0 | 219.0 | B |
| 4 | 28.0 | M | 173.8 | 67.70 | 17.1 | 70.0 | 127.0 | 43.5 | 27.1 | 45.0 | 217.0 | B |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13388 | 25.0 | M | 172.1 | 71.80 | 16.2 | 74.0 | 141.0 | 35.8 | 17.4 | 47.0 | 198.0 | C |
| 13389 | 21.0 | M | 179.7 | 63.90 | 12.1 | 74.0 | 128.0 | 33.0 | 1.1 | 48.0 | 167.0 | D |
| 13390 | 39.0 | M | 177.2 | 80.50 | 20.1 | 78.0 | 132.0 | 63.5 | 16.4 | 45.0 | 229.0 | A |
| 13391 | 64.0 | F | 146.1 | 57.70 | 40.4 | 68.0 | 121.0 | 19.3 | 9.2 | 0.0 | 75.0 | D |
| 13392 | 34.0 | M | 164.0 | 66.10 | 19.5 | 82.0 | 150.0 | 35.9 | 7.1 | 51.0 | 180.0 | C |
13393 rows × 12 columns
In [18]:
body.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13393 entries, 0 to 13392
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   age                      13393 non-null  float64
 1   gender                   13393 non-null  object
 2   height_cm                13393 non-null  float64
 3   weight_kg                13393 non-null  float64
 4   body fat_%               13393 non-null  float64
 5   diastolic                13393 non-null  float64
 6   systolic                 13393 non-null  float64
 7   gripForce                13393 non-null  float64
 8   sit and bend forward_cm  13393 non-null  float64
 9   sit-ups counts           13393 non-null  float64
 10  broad jump_cm            13393 non-null  float64
 11  class                    13393 non-null  object
dtypes: float64(10), object(2)
memory usage: 1.2+ MB
Use np.where to convert the labels to binary: gender (male: 0, female: 1), class (A: 0, not A: 1)¶
In [19]:
body['gender'] = np.where(body['gender']=='M',0,1)
body['class'] = np.where(body['class']=='A',0,1)
body
Out[19]:
| | age | gender | height_cm | weight_kg | body fat_% | diastolic | systolic | gripForce | sit and bend forward_cm | sit-ups counts | broad jump_cm | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27.0 | 0 | 172.3 | 75.24 | 21.3 | 80.0 | 130.0 | 54.9 | 18.4 | 60.0 | 217.0 | 1 |
| 1 | 25.0 | 0 | 165.0 | 55.80 | 15.7 | 77.0 | 126.0 | 36.4 | 16.3 | 53.0 | 229.0 | 0 |
| 2 | 31.0 | 0 | 179.6 | 78.00 | 20.1 | 92.0 | 152.0 | 44.8 | 12.0 | 49.0 | 181.0 | 1 |
| 3 | 32.0 | 0 | 174.5 | 71.10 | 18.4 | 76.0 | 147.0 | 41.4 | 15.2 | 53.0 | 219.0 | 1 |
| 4 | 28.0 | 0 | 173.8 | 67.70 | 17.1 | 70.0 | 127.0 | 43.5 | 27.1 | 45.0 | 217.0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13388 | 25.0 | 0 | 172.1 | 71.80 | 16.2 | 74.0 | 141.0 | 35.8 | 17.4 | 47.0 | 198.0 | 1 |
| 13389 | 21.0 | 0 | 179.7 | 63.90 | 12.1 | 74.0 | 128.0 | 33.0 | 1.1 | 48.0 | 167.0 | 1 |
| 13390 | 39.0 | 0 | 177.2 | 80.50 | 20.1 | 78.0 | 132.0 | 63.5 | 16.4 | 45.0 | 229.0 | 0 |
| 13391 | 64.0 | 1 | 146.1 | 57.70 | 40.4 | 68.0 | 121.0 | 19.3 | 9.2 | 0.0 | 75.0 | 1 |
| 13392 | 34.0 | 0 | 164.0 | 66.10 | 19.5 | 82.0 | 150.0 | 35.9 | 7.1 | 51.0 | 180.0 | 1 |
13393 rows × 12 columns
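np.where(condition, a, b) selects a elementwise where the condition holds and b elsewhere, which is what converts the string labels above. A minimal sketch with a made-up gender column:

```python
import numpy as np
import pandas as pd

# Hypothetical gender values, just to show the elementwise selection
gender = pd.Series(['M', 'F', 'M', 'F', 'F'])
binary = np.where(gender == 'M', 0, 1)

print(binary.tolist())  # [0, 1, 0, 1, 1]
```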
Separate the predictors x and the target y, then split them 7:3 with train_test_split from sklearn.model_selection¶
In [20]:
x = body.drop(columns='class') # predictors
y = body['class'] # target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,stratify=y, test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(9375, 11) (4018, 11) (9375,) (4018,)
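The stratify=y argument is what keeps the class ratio identical in both splits. A small sketch with synthetic labels (not the bodyPerformance data) to illustrate:

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Synthetic labels: 25% class 0, 75% class 1
y = np.array([0] * 250 + [1] * 750)
X = np.arange(1000).reshape(-1, 1)  # dummy feature matrix

X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y,
                                          test_size=0.3, random_state=1)

# Both splits preserve the 75% share of class 1
print(y_tr.mean(), y_te.mean())  # 0.75 0.75
```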
Import LogisticRegression and fit the model to the training data¶
In [45]:
from sklearn.linear_model import LogisticRegression
logR = LogisticRegression(random_state=45)
logR.fit(X_train,y_train)
# predict_proba() returns the estimated probability
# of each class (label) for the given samples.
proba = pd.DataFrame(logR.predict_proba(X_train))
# decision_function() returns the raw decision-function
# (confidence) scores of the fitted logistic regression.
cs = pd.DataFrame(logR.decision_function(X_train))
#display(proba)
#display(cs)
df = pd.concat([proba,cs], axis=1)
# class A is coded 0, so the first predict_proba column is P(A) ('A일 확률')
df.columns = ['A일 확률', 'A가 아닌 확률', '결정함수값']
df.sort_values(by='결정함수값', inplace=True)
df.reset_index(drop=True, inplace=True)
df
Out[45]:
| | A일 확률 | A가 아닌 확률 | 결정함수값 |
|---|---|---|---|
| 0 | 1.000000e+00 | 2.919260e-15 | -33.467446 |
| 1 | 9.981420e-01 | 1.858024e-03 | -6.286382 |
| 2 | 9.979469e-01 | 2.053080e-03 | -6.186359 |
| 3 | 9.970198e-01 | 2.980156e-03 | -5.812795 |
| 4 | 9.970177e-01 | 2.982331e-03 | -5.812063 |
| ... | ... | ... | ... |
| 9370 | 2.147840e-07 | 9.999998e-01 | 15.353633 |
| 9371 | 1.787746e-07 | 9.999998e-01 | 15.537140 |
| 9372 | 1.327906e-07 | 9.999999e-01 | 15.834492 |
| 9373 | 9.559042e-08 | 9.999999e-01 | 16.163193 |
| 9374 | 8.725606e-08 | 9.999999e-01 | 16.254419 |
9375 rows × 3 columns
Matching the class probabilities to the confidence scores gives the estimated probability of belonging to class A and the decision boundary.¶
In [54]:
import matplotlib.pyplot as plt
plt.figure(figsize=(15,5))
plt.axhline(y=0.5, linestyle='--', color='black', linewidth=1)
plt.axvline(x=0 , linestyle='--', color='black', linewidth=1)
plt.plot(df['결정함수값'], df['A가 아닌 확률'], 'g--', label='A가 아닌 확률')
plt.plot(df['결정함수값'], df['A가 아닌 확률'], 'g^')
plt.plot(df['결정함수값'], df['A일 확률'], 'b--', label='A일 확률')
plt.plot(df['결정함수값'], df['A일 확률'], 'bv')
plt.xlabel('결정함수값')
plt.ylabel('확률')
plt.legend(loc='upper left')
plt.show()
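The S-curve in the plot is not a coincidence: for binary logistic regression, predict_proba is exactly the sigmoid of decision_function. A sketch on synthetic data (make_classification stands in for the body data):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Synthetic binary classification data
X, y = make_classification(n_samples=200, n_features=4, random_state=0)
clf = LogisticRegression(random_state=45).fit(X, y)

scores = clf.decision_function(X)      # confidence scores z = w.x + b
proba_1 = clf.predict_proba(X)[:, 1]   # estimated P(class 1)

# predict_proba for class 1 equals sigmoid(z) = 1 / (1 + e^{-z})
sigmoid = 1 / (1 + np.exp(-scores))
print(np.allclose(proba_1, sigmoid))   # True
```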
Now evaluate the classifier on the test set: get predictions with the predict method, then pass y_test and pred to obtain the confusion matrix and the performance metrics (accuracy, precision, recall, F1 score, and AUC).¶
In [66]:
pred = logR.predict(X_test) # predictions on the test set
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
test_cm = confusion_matrix(y_test, pred)
test_acc = accuracy_score(y_test, pred)
test_prc = precision_score(y_test, pred)
test_rcll = recall_score(y_test, pred)
test_f1 = f1_score(y_test, pred)
print(test_cm) # confusion matrix
print('\n')
print('정확도\t{}%'.format(round(test_acc*100,2)))
print('정밀도\t{}%'.format(round(test_prc*100,2)))
print('재현율\t{}%'.format(round(test_rcll*100,2)))
print('f1스코어\t{}%'.format(round(test_f1*100,2)))
# Visualize the ROC curve along with the AUC
from sklearn.metrics import RocCurveDisplay
# RocCurveDisplay.from_estimator plots the ROC curve of a fitted scikit-learn classifier.
RocCurveDisplay.from_estimator(logR, X_test, y_test)
plt.show()
[[ 607  397]
 [ 237 2777]]


정확도	84.22%
정밀도	87.49%
재현율	92.14%
f1스코어	89.75%
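All four metrics come straight from the confusion-matrix cells. A hand-computed sketch on toy labels (the numbers here are hypothetical, not the test-set results above), cross-checked against sklearn:

```python
import numpy as np
from sklearn.metrics import (confusion_matrix, accuracy_score,
                             precision_score, recall_score, f1_score)

# Toy labels; positive class 1 corresponds to "not A"
y_true = np.array([0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 1, 0, 1, 1, 0, 1, 1])

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

acc  = (tp + tn) / (tp + tn + fp + fn)  # share of correct predictions
prc  = tp / (tp + fp)                   # share of predicted positives that are right
rcll = tp / (tp + fn)                   # share of actual positives that are found
f1   = 2 * prc * rcll / (prc + rcll)    # harmonic mean of precision and recall

print(acc, prc, rcll)  # 0.75 0.8 0.8
print(np.isclose(f1, f1_score(y_true, y_pred)))  # True
```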
In [67]:
import pandas as pd
import numpy as np
import warnings
# Suppress all warning messages
warnings.filterwarnings('ignore')
body = pd.read_csv('https://raw.githubusercontent.com/ADPclass/ADP_book_ver01/main/data/bodyPerformance.csv')
body
Out[67]:
| | age | gender | height_cm | weight_kg | body fat_% | diastolic | systolic | gripForce | sit and bend forward_cm | sit-ups counts | broad jump_cm | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27.0 | M | 172.3 | 75.24 | 21.3 | 80.0 | 130.0 | 54.9 | 18.4 | 60.0 | 217.0 | C |
| 1 | 25.0 | M | 165.0 | 55.80 | 15.7 | 77.0 | 126.0 | 36.4 | 16.3 | 53.0 | 229.0 | A |
| 2 | 31.0 | M | 179.6 | 78.00 | 20.1 | 92.0 | 152.0 | 44.8 | 12.0 | 49.0 | 181.0 | C |
| 3 | 32.0 | M | 174.5 | 71.10 | 18.4 | 76.0 | 147.0 | 41.4 | 15.2 | 53.0 | 219.0 | B |
| 4 | 28.0 | M | 173.8 | 67.70 | 17.1 | 70.0 | 127.0 | 43.5 | 27.1 | 45.0 | 217.0 | B |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13388 | 25.0 | M | 172.1 | 71.80 | 16.2 | 74.0 | 141.0 | 35.8 | 17.4 | 47.0 | 198.0 | C |
| 13389 | 21.0 | M | 179.7 | 63.90 | 12.1 | 74.0 | 128.0 | 33.0 | 1.1 | 48.0 | 167.0 | D |
| 13390 | 39.0 | M | 177.2 | 80.50 | 20.1 | 78.0 | 132.0 | 63.5 | 16.4 | 45.0 | 229.0 | A |
| 13391 | 64.0 | F | 146.1 | 57.70 | 40.4 | 68.0 | 121.0 | 19.3 | 9.2 | 0.0 | 75.0 | D |
| 13392 | 34.0 | M | 164.0 | 66.10 | 19.5 | 82.0 | 150.0 | 35.9 | 7.1 | 51.0 | 180.0 | C |
13393 rows × 12 columns
In [68]:
body.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13393 entries, 0 to 13392
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   age                      13393 non-null  float64
 1   gender                   13393 non-null  object
 2   height_cm                13393 non-null  float64
 3   weight_kg                13393 non-null  float64
 4   body fat_%               13393 non-null  float64
 5   diastolic                13393 non-null  float64
 6   systolic                 13393 non-null  float64
 7   gripForce                13393 non-null  float64
 8   sit and bend forward_cm  13393 non-null  float64
 9   sit-ups counts           13393 non-null  float64
 10  broad jump_cm            13393 non-null  float64
 11  class                    13393 non-null  object
dtypes: float64(10), object(2)
memory usage: 1.2+ MB
In [69]:
# Preprocess the gender and class variables
body['gender'] = np.where(body['gender']=='M',0,1)
class_dict = {'A':0, 'B':1, 'C':2, 'D':3}
body['class'] = body['class'].map(class_dict)
body
Out[69]:
| | age | gender | height_cm | weight_kg | body fat_% | diastolic | systolic | gripForce | sit and bend forward_cm | sit-ups counts | broad jump_cm | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27.0 | 0 | 172.3 | 75.24 | 21.3 | 80.0 | 130.0 | 54.9 | 18.4 | 60.0 | 217.0 | 2 |
| 1 | 25.0 | 0 | 165.0 | 55.80 | 15.7 | 77.0 | 126.0 | 36.4 | 16.3 | 53.0 | 229.0 | 0 |
| 2 | 31.0 | 0 | 179.6 | 78.00 | 20.1 | 92.0 | 152.0 | 44.8 | 12.0 | 49.0 | 181.0 | 2 |
| 3 | 32.0 | 0 | 174.5 | 71.10 | 18.4 | 76.0 | 147.0 | 41.4 | 15.2 | 53.0 | 219.0 | 1 |
| 4 | 28.0 | 0 | 173.8 | 67.70 | 17.1 | 70.0 | 127.0 | 43.5 | 27.1 | 45.0 | 217.0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13388 | 25.0 | 0 | 172.1 | 71.80 | 16.2 | 74.0 | 141.0 | 35.8 | 17.4 | 47.0 | 198.0 | 2 |
| 13389 | 21.0 | 0 | 179.7 | 63.90 | 12.1 | 74.0 | 128.0 | 33.0 | 1.1 | 48.0 | 167.0 | 3 |
| 13390 | 39.0 | 0 | 177.2 | 80.50 | 20.1 | 78.0 | 132.0 | 63.5 | 16.4 | 45.0 | 229.0 | 0 |
| 13391 | 64.0 | 1 | 146.1 | 57.70 | 40.4 | 68.0 | 121.0 | 19.3 | 9.2 | 0.0 | 75.0 | 3 |
| 13392 | 34.0 | 0 | 164.0 | 66.10 | 19.5 | 82.0 | 150.0 | 35.9 | 7.1 | 51.0 | 180.0 | 2 |
13393 rows × 12 columns
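Series.map replaces each value through the dict lookup. One behavior worth knowing (a quick sketch, unrelated to the data above): keys missing from the dict become NaN silently rather than raising an error.

```python
import pandas as pd

s = pd.Series(['A', 'B', 'C', 'D', 'E'])
mapped = s.map({'A': 0, 'B': 1, 'C': 2, 'D': 3})

# 'E' has no entry in the dict, so it maps to NaN (and the dtype becomes float)
print(mapped.tolist())  # [0.0, 1.0, 2.0, 3.0, nan]
```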
Split the data into training and test sets with train_test_split¶
In [73]:
x = body.drop(columns='class')
y = body['class']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,stratify=y, test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(9375, 11) (4018, 11) (9375,) (4018,)
Set the parameters in LogisticRegression for softmax regression, then fit the model to the data with the fit method¶
C is a hyperparameter that controls the regularization strength.¶
solver selects the optimization algorithm.¶
lbfgs works well on small datasets and is also suitable for multiclass classification problems.¶
random_state is a seed value for reproducible results.¶
multi_class chooses how multiclass problems are handled; 'multinomial' solves the multiclass problem by minimizing the cross-entropy loss.¶
In [76]:
from sklearn.linear_model import LogisticRegression
softm = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=10, random_state=45)
softm.fit(X_train, y_train)
Out[76]:
LogisticRegression(C=10, multi_class='multinomial', random_state=45)
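In the multinomial formulation, predict_proba is the softmax of the per-class decision_function scores, the multiclass generalization of the sigmoid used in the binary case. A sketch on synthetic 4-class data (make_classification stands in for the body data; with the default lbfgs solver and more than two classes, recent scikit-learn uses the multinomial formulation by default, so multi_class is not passed here):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Synthetic 4-class data as a stand-in for the A/B/C/D grades
X, y = make_classification(n_samples=400, n_features=6, n_informative=4,
                           n_classes=4, random_state=0)
softm = LogisticRegression(C=10, max_iter=1000, random_state=45).fit(X, y)

scores = softm.decision_function(X)  # one score per class, shape (400, 4)

# softmax: exponentiate and normalize (shift by the row max for stability)
z = scores - scores.max(axis=1, keepdims=True)
softmax = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)

print(np.allclose(softm.predict_proba(X), softmax))  # True
```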
Predict with the predict method, then evaluate the classifier with the confusion matrix and accuracy¶
In [79]:
pred = softm.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
test_cm = confusion_matrix(y_test, pred)
test_acc = accuracy_score(y_test, pred)
print(test_cm) # confusion matrix
print('\n')
print('정확도\t{}%'.format(round(test_acc*100,2)))
[[707 261  36   0]
 [269 403 300  32]
 [ 92 207 525 181]
 [ 13  63 157 772]]


정확도	59.91%
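With four classes, precision, recall, and F1 are per-class quantities, which is why only accuracy is printed above; to get a single number, precision_score and friends need an average= argument ('macro', 'micro', or 'weighted'). A sketch with hypothetical 3-class labels:

```python
from sklearn.metrics import precision_score, recall_score, classification_report

# Hypothetical 3-class labels, illustrative only
y_true = [0, 0, 1, 1, 1, 2, 2, 2, 2]
y_pred = [0, 1, 1, 1, 2, 2, 2, 0, 2]

# 'macro' averages the per-class scores with equal weight per class
prc_macro = precision_score(y_true, y_pred, average='macro')
rcll_macro = recall_score(y_true, y_pred, average='macro')
print(round(prc_macro, 4), round(rcll_macro, 4))  # 0.6389 0.6389

# classification_report shows the per-class breakdown at once
print(classification_report(y_true, y_pred))
```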