DNF Kubeflow, Kubernetes (2)
Now let's actually do the DNF avatar price prediction.
First, store the option lists that will feed the Django select inputs in tables.
1) Create the select-option table for rare avatars
CREATE TABLE input_list (
title TEXT,
jobname TEXT,
emblem TEXT
);
2) Create the select-option table for advanced avatars
CREATE TABLE input_list1 (
title TEXT,
jobname TEXT,
emblem TEXT
);
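For reference, here is a rough sketch of how the Django side could read these tables back to fill the select boxes. This is my own assumption (the actual Django view/form code isn't shown in this post); the connection settings are the same ones used in main.py below, and load_select_options is a hypothetical helper.
import pymysql

def load_select_options():
    # same connection settings as main.py below
    connection = pymysql.connect(
        host="10.233.18.183", port=3306,
        user="root", password="1234", database="donpa_item"
    )
    try:
        with connection.cursor() as cursor:
            options = {}
            # each column is inserted independently, so skip the NULL rows
            for col in ("title", "jobname", "emblem"):
                cursor.execute(f"SELECT DISTINCT {col} FROM input_list WHERE {col} IS NOT NULL")
                options[col] = [row[0] for row in cursor.fetchall()]
            return options
    finally:
        connection.close()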
When you enter the avatar info in Django, it will spit out today's predicted price!!
To do that we need the scaler object, the encoder object, and the model object. We'll keep them on a PV, so let's create the PVCs first.
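Roughly, the prediction path inside Django would look like the sketch below: load the three pickles from the mounted PV paths and repeat the same scale → one-hot → column-rename steps as the training code further down. Treat it as a sketch under those assumptions, not the actual view code; predict_price is a hypothetical helper.
import re
import joblib
import pandas as pd

# the PV mount paths used throughout this post
scaler = joblib.load('/home/jovyan/scaler/scaler.pkl')
encoder = joblib.load('/home/jovyan/encoder/encoder.pkl')
model = joblib.load('/home/jovyan/model/model.pkl')

def predict_price(input_df: pd.DataFrame) -> float:
    """input_df: one row with the same raw feature columns as training."""
    obj_col = input_df.select_dtypes(include='object').columns
    # scale the numeric columns, one-hot encode the categorical ones
    num = pd.DataFrame(scaler.transform(input_df.drop(columns=obj_col)),
                       columns=input_df.drop(columns=obj_col).columns)
    cat = pd.DataFrame(encoder.transform(input_df[obj_col]).toarray(),
                       columns=encoder.get_feature_names(obj_col))
    features = pd.concat([num, cat], axis=1)
    # training stripped special characters from column names, so do the same here
    features.columns = [re.sub(r'[^\w\s]', '', c) for c in features.columns]
    return float(model.predict(features)[0])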
vim scaler-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: sy  # the namespace must be set
  name: scaler-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
vim encoder-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: sy  # the namespace must be set
  name: encoder-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
vim model-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: sy  # the namespace must be set
  name: model-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
Create them with kubectl create -f.
Now let's build the image that fills the Django select-list tables for rare and advanced DNF avatars and prints the validation MAE so Katib can use it.
Rare avatar image
dockerfile
# base image
FROM python:3.9
# set the working directory
WORKDIR /app
# copy main.py into the working directory
COPY main.py /app/main.py
COPY requirements.txt /app
# install the required dependencies
RUN pip install --no-cache-dir -r requirements.txt
CMD [ "python", "main.py" ]
main.py
import pandas as pd
import pymysql
from pymongo import MongoClient
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
import xgboost as xgb
import argparse
# connection settings
host = "10.233.18.183"
port = 3306
database = "donpa_item"
user = "root"
password = "1234"
# connect
connection = pymysql.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    database=database
)
# SQL query
query = "SELECT * FROM goldprice"
# load into a DataFrame
date_df = pd.read_sql(query, connection)
# close the connection
connection.close()
# MongoDB connection
client = MongoClient("mongodb://3.38.178.84:27017/")
db = client["donpa"]  # database name
collection = db["donpa_aabata_rare"]  # collection name
# fetch the collection data
data = collection.find()
# convert to a DataFrame
df = pd.DataFrame(data)
# drop the _id column
df = df.drop(columns="_id")
df['price'] = df['price'].astype('float')
df['soldDate'] = df['soldDate'].str.replace('-', '')
# attach the sell and buy columns
df = df.merge(date_df, left_on='soldDate', right_on='date', how='left')
# if sell/buy are null, fill them with the most recent sell/buy values
null_check = df['sell'].isnull().sum()
if null_check != 0:
    # take the last row of date_df and fill the NaN values with it
    last_date_df_row = date_df.iloc[-1]
    df['sell'].fillna(last_date_df_row['sell'], inplace=True)
    df['buy'].fillna(last_date_df_row['buy'], inplace=True)
# drop the date column
df.drop(columns='date', inplace=True)
df['soldDate'] = pd.to_datetime(df['soldDate'])
# add a year column
df['year'] = df['soldDate'].dt.year
# add a month column
df['month'] = df['soldDate'].dt.month
# add a day column
df['day'] = df['soldDate'].dt.day
# add a day-of-week column
df['day_name'] = df['soldDate'].dt.day_name()
# drop columns we no longer need
df.drop(columns='soldDate', inplace=True)
# ava_rit is always "rare" here anyway, so drop it too
df.drop(columns='ava_rit', inplace=True)
# move the target column to the end
price_column = df.pop('price')
df['price'] = price_column
title_data = df['title'].drop_duplicates()
jobname_data = df['jobname'].drop_duplicates()
emblem_data = df['emblem'].drop_duplicates()
# code that fills the Django select-list table
# connection settings
host = "10.233.18.183"
port = 3306
database = "donpa_item"
user = "root"
password = "1234"
# connect
connection = pymysql.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    database=database
)
try:
    # create a cursor
    cursor = connection.cursor()
    # insert the data (only values that do not exist yet)
    for item in title_data:
        sql = f"INSERT INTO input_list (title) SELECT '{item}' FROM DUAL WHERE NOT EXISTS (SELECT * FROM input_list WHERE title = '{item}')"
        cursor.execute(sql)
    for item in jobname_data:
        sql = f"INSERT INTO input_list (jobname) SELECT '{item}' FROM DUAL WHERE NOT EXISTS (SELECT * FROM input_list WHERE jobname = '{item}')"
        cursor.execute(sql)
    for item in emblem_data:
        sql = f"INSERT INTO input_list (emblem) SELECT '{item}' FROM DUAL WHERE NOT EXISTS (SELECT * FROM input_list WHERE emblem = '{item}')"
        cursor.execute(sql)
    # commit
    connection.commit()
except Exception as e:
    # roll back on error
    connection.rollback()
    print(f"Error while inserting data: {str(e)}")
finally:
    # close the connection
    connection.close()
X_train = df.drop(columns='price')
y_train = df['price']
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
obj_col = X_train.select_dtypes(include='object').columns
sds = StandardScaler()
sds.fit(X_train.drop(columns=obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns=X_train.drop(columns=obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns=X_validation.drop(columns=obj_col).columns)
# re-attach the object-type columns
for i in obj_col:
    X_train_sc[i] = X_train[i]
    X_validation_sc[i] = X_validation[i]
# save the scaler object
joblib.dump(sds, '/home/jovyan/scaler/scaler.pkl')
# create the OneHotEncoder object
encoder = OneHotEncoder()
X_full = pd.concat([X_train_sc, X_validation_sc])
# select only the categorical columns
obj_df = X_full.select_dtypes(include='object')
# select only the numeric columns
no_obj_df = X_full.select_dtypes(exclude='object')
# one-hot encode the categorical columns
encoded_features = encoder.fit_transform(obj_df)
# convert the encoded result into a DataFrame
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names(obj_df.columns))
# concatenate the encoded categorical columns with the numeric columns
X_train_sc_ec = pd.concat([no_obj_df[:len(X_train_sc)], encoded_df[:len(X_train_sc)]], axis=1)
X_validation_sc_ec = pd.concat([no_obj_df[len(X_train_sc):], encoded_df[len(X_train_sc):].reset_index(drop=True)], axis=1)
# save the encoder object
joblib.dump(encoder, '/home/jovyan/encoder/encoder.pkl')
import re
# build a list of new column names with special characters removed
new_columns = []
for old_column in X_train_sc_ec.columns:
    new_column = re.sub(r'[^\w\s]', '', old_column)  # strip special characters
    new_columns.append(new_column)
# apply the new column names
X_train_sc_ec.columns = new_columns
X_validation_sc_ec.columns = new_columns
parser = argparse.ArgumentParser()
parser.add_argument('--learning_rate', required=False, type=float, default=0.1)
parser.add_argument('--n_estimators', required=False, type=int, default=100)
parser.add_argument('--max_depth', required=False, type=int, default=5)
args = parser.parse_args()
xgb_model = xgb.XGBRegressor(random_state=10,
                             learning_rate=args.learning_rate,
                             n_estimators=args.n_estimators,
                             max_depth=args.max_depth
                             )
xgb_model.fit(X_train_sc_ec, Y_train)
# save the model to a file
joblib.dump(xgb_model, '/home/jovyan/model/model.pkl')
# predict on the validation data
pred_validation = xgb_model.predict(X_validation_sc_ec)
# evaluate performance
from sklearn.metrics import mean_absolute_error
mae_validation = mean_absolute_error(Y_validation, pred_validation)
# printed in the name=value format that Katib's StdOut metrics collector parses
print("mae_validation="+str(mae_validation))
requirements.txt
pandas==1.1.5
PyMySQL==1.1.0
pymongo
scikit-learn==0.24.2
numpy
joblib==1.1.1
xgboost==1.5.2
sudo docker build -t kubeflow-registry.default.svc.cluster.local:30000/rarepipline1:latest . >>> build the image
sudo docker push kubeflow-registry.default.svc.cluster.local:30000/rarepipline1:latest >>> push the image to the registry
For now, run it once as a test CronJob.
Once it shows Completed, check that the scaler, encoder, and model objects were saved to the PVs!!
1. Scaler object path >>> sudo ls -l /srv/nfs-volume/default-scaler-pvc-pvc-3a3afd69-adbe-4c7d-abe8-a0ffc10ab344
2. Encoder object path >>> sudo ls -l /srv/nfs-volume/default-encoder-pvc-pvc-895d2ac0-a5bd-40e7-830c-955dd8e6dd34
3. Model object path >>> sudo ls -l /srv/nfs-volume/default-model-pvc-pvc-8acbcb25-35f3-4250-bacc-7a4acb4f1bdd

Saved successfully!!
Run the same test to confirm the advanced avatar objects get saved too.
Now let's bring Django up.
Attach a Service to Django (NodePort)
apiVersion: v1
kind: Service
metadata:
  name: donpa-django-service
spec:
  type: NodePort
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000  # port of the Django pod
      nodePort: 30001
  selector:
    app: donpa-django-rs
Django ReplicaSet (pulls the Django image we built)
apiVersion: apps/v1
kind: ReplicaSet
metadata:
  name: donpa-django-rs
spec:
  replicas: 2
  selector:
    matchLabels:
      app: donpa-django-rs
  template:
    metadata:
      labels:
        app: donpa-django-rs
    spec:
      containers:
        - name: donpa-django
          image: sy02229/donpa_django:latest
          ports:
            - containerPort: 8000
          volumeMounts:
            - name: scaler-volume
              mountPath: /home/jovyan/scaler
            - name: encoder-volume
              mountPath: /home/jovyan/encoder
            - name: model-volume
              mountPath: /home/jovyan/model
      volumes:
        - name: scaler-volume
          persistentVolumeClaim:
            claimName: scaler-pvc
        - name: encoder-volume
          persistentVolumeClaim:
            claimName: encoder-pvc
        - name: model-volume
          persistentVolumeClaim:
            claimName: model-pvc
Open http://<external IP>:30001

Now let's use Kubeflow to build a pipeline that re-optimizes the model every day and saves it to the PV. We need four images (a rough sketch of how they fit together follows this list):
1) An image for everything up to the modeling step (preprocessing)
2) An image that trains the model inside the Katib trials
3) An image that launches Katib
4) An image that fetches the Katib result and updates the model
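Before building each image, here's roughly how the finished pieces could be wired together as a Kubeflow pipeline. This is only a sketch, assuming the kfp v1 SDK and the image/PVC names that appear later in this post (rarestart, katib_start, model_update, csv-pvc, katib-yaml-pvc, model-pvc); the mount paths mirror what each container expects, but it is not the exact pipeline definition.
import kfp
from kfp import dsl

REGISTRY = "kubeflow-registry.default.svc.cluster.local:30000"

@dsl.pipeline(name="donpa-rare-daily",
              description="Daily DNF rare avatar price model update")
def donpa_rare_pipeline():
    # 1) preprocessing image: writes the train/validation CSVs (and scaler) to the PVCs
    preprocess = dsl.ContainerOp(
        name="preprocess",
        image=f"{REGISTRY}/rarestart:latest",
    ).add_pvolumes({
        "/home/jovyan/csv": dsl.PipelineVolume(pvc="csv-pvc"),
        "/home/jovyan/scaler": dsl.PipelineVolume(pvc="scaler-pvc"),
    })

    # 3) image that launches the Katib experiment (the trial pods use rarepipline1)
    run_katib = dsl.ContainerOp(
        name="katib-start",
        image=f"{REGISTRY}/katib_start:latest",
    ).add_pvolumes({"/app/exp": dsl.PipelineVolume(pvc="katib-yaml-pvc")})
    run_katib.after(preprocess)

    # 4) image that waits for Katib, retrains with the best parameters and saves the model
    update = dsl.ContainerOp(
        name="model-update",
        image=f"{REGISTRY}/model_update:latest",
    ).add_pvolumes({
        "/app/exp": dsl.PipelineVolume(pvc="katib-yaml-pvc"),
        "/app/csv": dsl.PipelineVolume(pvc="csv-pvc"),
        "/app/model": dsl.PipelineVolume(pvc="model-pvc"),
    })
    update.after(run_katib)

if __name__ == "__main__":
    kfp.compiler.Compiler().compile(donpa_rare_pipeline, "donpa_rare_pipeline.yaml")
Each step just runs the image's own CMD/ENTRYPOINT, so no command override is needed.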
Create a PVC to hold the train/validation CSVs produced before the modeling step
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: sy
  name: csv-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
Image for everything up to the modeling step
main.py
import pandas as pd
import pymysql
from pymongo import MongoClient
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
import xgboost as xgb
import argparse
# connection settings
host = "10.233.18.183"
port = 3306
database = "donpa_item"
user = "root"
password = "1234"
# connect
connection = pymysql.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    database=database
)
# SQL query
query = "SELECT * FROM goldprice"
# load into a DataFrame
date_df = pd.read_sql(query, connection)
# close the connection
connection.close()
# MongoDB connection
client = MongoClient("mongodb://3.38.178.84:27017/")
db = client["donpa"]  # database name
collection = db["donpa_aabata_rare"]  # collection name
# fetch the collection data
data = collection.find()
# convert to a DataFrame
df = pd.DataFrame(data)
# drop the _id column
df = df.drop(columns="_id")
df['price'] = df['price'].astype('float')
df['soldDate'] = df['soldDate'].str.replace('-', '')
# attach the sell and buy columns
df = df.merge(date_df, left_on='soldDate', right_on='date', how='left')
# if sell/buy are null, fill them with the most recent sell/buy values
null_check = df['sell'].isnull().sum()
if null_check != 0:
    # take the last row of date_df and fill the NaN values with it
    last_date_df_row = date_df.iloc[-1]
    df['sell'].fillna(last_date_df_row['sell'], inplace=True)
    df['buy'].fillna(last_date_df_row['buy'], inplace=True)
# drop the date column
df.drop(columns='date', inplace=True)
df['soldDate'] = pd.to_datetime(df['soldDate'])
# add a year column
df['year'] = df['soldDate'].dt.year
# add a month column
df['month'] = df['soldDate'].dt.month
# add a day column
df['day'] = df['soldDate'].dt.day
# add a day-of-week column
df['day_name'] = df['soldDate'].dt.day_name()
# drop columns we no longer need
df.drop(columns='soldDate', inplace=True)
# ava_rit is always "rare" here anyway, so drop it too
df.drop(columns='ava_rit', inplace=True)
# move the target column to the end
price_column = df.pop('price')
df['price'] = price_column
title_data = df['title'].drop_duplicates()
jobname_data = df['jobname'].drop_duplicates()
emblem_data = df['emblem'].drop_duplicates()
# code that fills the Django select-list table
# connection settings
host = "10.233.18.183"
port = 3306
database = "donpa_item"
user = "root"
password = "1234"
# connect
connection = pymysql.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    database=database
)
try:
    # create a cursor
    cursor = connection.cursor()
    # insert the data (only values that do not exist yet)
    for item in title_data:
        sql = f"INSERT INTO input_list (title) SELECT '{item}' FROM DUAL WHERE NOT EXISTS (SELECT * FROM input_list WHERE title = '{item}')"
        cursor.execute(sql)
    for item in jobname_data:
        sql = f"INSERT INTO input_list (jobname) SELECT '{item}' FROM DUAL WHERE NOT EXISTS (SELECT * FROM input_list WHERE jobname = '{item}')"
        cursor.execute(sql)
    for item in emblem_data:
        sql = f"INSERT INTO input_list (emblem) SELECT '{item}' FROM DUAL WHERE NOT EXISTS (SELECT * FROM input_list WHERE emblem = '{item}')"
        cursor.execute(sql)
    # commit
    connection.commit()
except Exception as e:
    # roll back on error
    connection.rollback()
    print(f"Error while inserting data: {str(e)}")
finally:
    # close the connection
    connection.close()
X_train = df.drop(columns='price')
y_train = df['price']
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
obj_col = X_train.select_dtypes(include='object').columns
sds = StandardScaler()
sds.fit(X_train.drop(columns=obj_col))
X_train_sc = sds.transform(X_train.drop(columns=obj_col))
X_train_sc = pd.DataFrame(X_train_sc, columns=X_train.drop(columns=obj_col).columns)
X_validation_sc = sds.transform(X_validation.drop(columns=obj_col))
X_validation_sc = pd.DataFrame(X_validation_sc, columns=X_validation.drop(columns=obj_col).columns)
# re-attach the object-type columns
for i in obj_col:
    X_train_sc[i] = X_train[i]
    X_validation_sc[i] = X_validation[i]
# save the scaler object
joblib.dump(sds, '/home/jovyan/scaler/scaler.pkl')
# create the OneHotEncoder object
encoder = OneHotEncoder()
X_full = pd.concat([X_train_sc, X_validation_sc])
# select only the categorical columns
obj_df = X_full.select_dtypes(include='object')
# select only the numeric columns
no_obj_df = X_full.select_dtypes(exclude='object')
# one-hot encode the categorical columns
encoded_features = encoder.fit_transform(obj_df)
# convert the encoded result into a DataFrame
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names(obj_df.columns))
# concatenate the encoded categorical columns with the numeric columns
X_train_sc_ec = pd.concat([no_obj_df[:len(X_train_sc)], encoded_df[:len(X_train_sc)]], axis=1)
X_validation_sc_ec = pd.concat([no_obj_df[len(X_train_sc):], encoded_df[len(X_train_sc):].reset_index(drop=True)], axis=1)
# saving the encoder object (commented out here)
# joblib.dump(encoder, '/home/jovyan/encoder/encoder.pkl')
import re
# build a list of new column names with special characters removed
new_columns = []
for old_column in X_train_sc_ec.columns:
    new_column = re.sub(r'[^\w\s]', '', old_column)  # strip special characters
    new_columns.append(new_column)
# apply the new column names
X_train_sc_ec.columns = new_columns
X_validation_sc_ec.columns = new_columns
X_train_sc_ec.to_csv('/X_train_sc_ec.csv', index=False)
X_validation_sc_ec.to_csv('/X_validation_sc_ec.csv', index=False)
Y_train.to_csv('/Y_train.csv', index=False)
Y_validation.to_csv('/Y_validation.csv', index=False)
X_train_sc_ec.to_csv('/home/jovyan/csv/X_train_sc_ec.csv', index=False)
X_validation_sc_ec.to_csv('/home/jovyan/csv/X_validation_sc_ec.csv', index=False)
Y_train.to_csv('/home/jovyan/csv/Y_train.csv', index=False)
Y_validation.to_csv('/home/jovyan/csv/Y_validation.csv', index=False)
dockerfile
FROM sy02229/new_image:latest
# set the working directory
WORKDIR /app
# copy main.py into the working directory
COPY main.py /app/main.py
RUN mkdir csv
CMD [ "python", "main.py" ]
requirements.txt
pandas==1.1.5
PyMySQL==1.1.0
pymongo
scikit-learn==0.24.2
numpy
joblib==1.1.1
xgboost==1.5.2
sudo docker build -t kubeflow-registry.default.svc.cluster.local:30000/rarestart:latest .
sudo docker push kubeflow-registry.default.svc.cluster.local:30000/rarestart:latest
Let's test this first and then build the pipeline.
Katib trial template YAML (the Job spec)
apiVersion: batch/v1
kind: Job
spec:
  template:
    metadata:
      annotations:
        sidecar.istio.io/inject: 'false'
    spec:
      containers:
        - name: training-container
          image: kubeflow-registry.default.svc.cluster.local:30000/rarepipline1:latest
          command:
            - python3
            - /app/main.py
            - '--learning_rate=${trialParameters.learning_rate}'
            - '--n_estimators=${trialParameters.n_estimators}'
            - '--max_depth=${trialParameters.max_depth}'
          volumeMounts:
            - name: scaler-volume
              mountPath: /home/jovyan/scaler
            - name: encoder-volume
              mountPath: /home/jovyan/encoder
            - name: model-volume
              mountPath: /home/jovyan/model
            - name: csv-volume
              mountPath: /home/jovyan/csv
      restartPolicy: Never
      volumes:
        - name: scaler-volume
          persistentVolumeClaim:
            claimName: scaler-pvc
        - name: encoder-volume
          persistentVolumeClaim:
            claimName: encoder-pvc
        - name: model-volume
          persistentVolumeClaim:
            claimName: model-pvc
        - name: csv-volume
          persistentVolumeClaim:
            claimName: csv-pvc
Execution screenshot

Optimal values

OK, let's sort out what still needs to be done.
1. Store the Katib YAML on a PV, then load the stored YAML and run it.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: sy
  name: katib-yaml-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 1Gi
Katib Experiment YAML
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  name: donpa-rare
  namespace: 'sy'
spec:
  maxTrialCount: 40
  parallelTrialCount: 2
  maxFailedTrialCount: 3
  resumePolicy: Never
  objective:
    type: minimize
    goal: 1000000
    objectiveMetricName: mae_validation
    additionalMetricNames: []
  algorithm:
    algorithmName: random
    algorithmSettings: []
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        min: '0.01'
        max: '0.3'
        step: '0.05'
    - name: 'n'
      parameterType: int
      feasibleSpace:
        min: '30'
        max: '200'
        step: '1'
    - name: d
      parameterType: int
      feasibleSpace:
        min: '1'
        max: '30'
        step: '1'
  metricsCollectorSpec:
    collector:
      kind: StdOut
  trialTemplate:
    primaryContainerName: training-container
    successCondition: status.conditions.#(type=="Complete")#|#(status=="True")#
    failureCondition: status.conditions.#(type=="Failed")#|#(status=="True")#
    retain: false
    trialParameters:
      - name: learning_rate
        reference: lr
        description: ''
      - name: n_estimators
        reference: 'n'
        description: ''
      - name: max_depth
        reference: d
        description: ''
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          metadata:
            annotations:
              sidecar.istio.io/inject: 'false'
          spec:
            containers:
              - name: training-container
                image: >-
                  kubeflow-registry.default.svc.cluster.local:30000/rarepipline1:latest
                command:
                  - python3
                  - /app/main.py
                  - '--learning_rate=${trialParameters.learning_rate}'
                  - '--n_estimators=${trialParameters.n_estimators}'
                  - '--max_depth=${trialParameters.max_depth}'
                volumeMounts:
                  - name: scaler-volume
                    mountPath: /home/jovyan/scaler
                  - name: encoder-volume
                    mountPath: /home/jovyan/encoder
                  - name: model-volume
                    mountPath: /home/jovyan/model
                  - name: csv-volume
                    mountPath: /home/jovyan/csv
            restartPolicy: Never
            volumes:
              - name: scaler-volume
                persistentVolumeClaim:
                  claimName: scaler-pvc
              - name: encoder-volume
                persistentVolumeClaim:
                  claimName: encoder-pvc
              - name: model-volume
                persistentVolumeClaim:
                  claimName: model-pvc
              - name: csv-volume
                persistentVolumeClaim:
                  claimName: csv-pvc
Code that runs the Katib YAML
from kubeflow.katib import KatibClient
import yaml
import argparse

katib_client = KatibClient()
test_yaml = "test.yaml"
with open(test_yaml, "r") as yaml_file:
    experiment_config = yaml.load(yaml_file, Loader=yaml.FullLoader)
namespace = experiment_config['metadata']['namespace']
katib_client.create_experiment(experiment_config, namespace)
Code that fetches the best parameters once the Katib run has finished
from kubeflow.katib import KatibClient
import yaml
import pandas as pd
import time
import argparse

yaml_path = "test.yaml"
with open(yaml_path, "r") as yaml_file:
    test_yaml = yaml.load(yaml_file, Loader=yaml.FullLoader)
name = test_yaml['metadata']['name']
namespace = test_yaml['metadata']['namespace']
katib_client = KatibClient()
time.sleep(60)
while True:
    time.sleep(10)
    if katib_client.get_experiment_status(name, namespace) == 'Succeeded':
        experiment = katib_client.get_experiment(name=name, namespace=namespace)
        lr = experiment['status']['currentOptimalTrial']['parameterAssignments'][0]['value']
        n = experiment['status']['currentOptimalTrial']['parameterAssignments'][1]['value']
        d = experiment['status']['currentOptimalTrial']['parameterAssignments'][2]['value']
        break
Now let's build the image that launches Katib.
katib_start.py
from kubeflow.katib import KatibClient
import yaml
import argparse

test_yaml = '/app/exp/rare.yaml'
katib_client = KatibClient()
with open(test_yaml, "r") as yaml_file:
    experiment_config = yaml.load(yaml_file, Loader=yaml.FullLoader)
namespace = experiment_config['metadata']['namespace']
try:
    katib_client.create_experiment(experiment_config, namespace)
except:
    # ignore errors (e.g. the experiment already exists)
    pass
requirements.txt
kubeflow-katib
PyYAML==5.2
dockerfile
FROM python:3.9
ENV PYTHONUNBUFFERED 1
WORKDIR /app
COPY . /app/
RUN mkdir exp
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
ENTRYPOINT ["python", "katib_start.py"]
sudo docker build -t kubeflow-registry.default.svc.cluster.local:30000/katib_start:latest .
sudo docker push kubeflow-registry.default.svc.cluster.local:30000/katib_start:latest
Image that, once Katib finishes, retrains and saves the model with the best parameters and then deletes the Katib experiment
model_update.py
from kubeflow.katib import KatibClient
from sklearn.metrics import mean_absolute_error
import yaml
import pandas as pd
import time
import argparse
import joblib
import xgboost as xgb

time.sleep(300)
while True:
    test_yaml = '/app/exp/rare.yaml'
    with open(test_yaml, "r") as yaml_file:
        experiment_config = yaml.load(yaml_file, Loader=yaml.FullLoader)
    name = experiment_config['metadata']['name']
    namespace = experiment_config['metadata']['namespace']
    katib_client = KatibClient()
    time.sleep(10)
    if katib_client.get_experiment_status(name, namespace) == 'Succeeded':
        experiment = katib_client.get_experiment(name=name, namespace=namespace)
        lr = experiment['status']['currentOptimalTrial']['parameterAssignments'][0]['value']
        n = experiment['status']['currentOptimalTrial']['parameterAssignments'][1]['value']
        d = experiment['status']['currentOptimalTrial']['parameterAssignments'][2]['value']
        katib_client.delete_experiment(name, namespace)
        new_model = xgb.XGBRegressor(random_state=10,
                                     learning_rate=float(lr),
                                     n_estimators=int(n),
                                     max_depth=int(d)
                                     )
        X_train_sc_ec = pd.read_csv('/app/csv/X_train_sc_ec.csv')
        X_validation_sc_ec = pd.read_csv('/app/csv/X_validation_sc_ec.csv')
        Y_train = pd.read_csv('/app/csv/Y_train.csv')
        Y_validation = pd.read_csv('/app/csv/Y_validation.csv')
        # retrain and evaluate the new model
        new_model.fit(X_train_sc_ec, Y_train)
        new_pred_validation = new_model.predict(X_validation_sc_ec)
        new_mae_validation = mean_absolute_error(Y_validation, new_pred_validation)
        # save the model to a file
        joblib.dump(new_model, '/app/model/model.pkl')
        break
dockerfile
FROM python:3.9
ENV PYTHONUNBUFFERED 1
WORKDIR /app
COPY . /app/
RUN mkdir exp
RUN mkdir csv
RUN mkdir model
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
ENTRYPOINT ["python", "model_update.py"]
requirements.txt
kubeflow-katib
PyYAML==5.2
scikit-learn==0.24.2
pandas==1.1.5
joblib==1.1.1
xgboost==1.5.2
sudo docker build -t kubeflow-registry.default.svc.cluster.local:30000/model_update:latest .
sudo docker push kubeflow-registry.default.svc.cluster.local:30000/model_update:latest
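To get the "once a day" part, the compiled pipeline can be scheduled as a Kubeflow recurring run. Another sketch, assuming the kfp v1 client; the host, experiment name, and pipeline package path here are placeholders, not values from this post.
import kfp

# placeholder host; point this at your Kubeflow Pipelines endpoint
client = kfp.Client(host="http://<kubeflow-pipelines-host>")

experiment = client.create_experiment(name="donpa-rare")
client.create_recurring_run(
    experiment_id=experiment.id,
    job_name="donpa-rare-daily",
    cron_expression="0 0 0 * * *",  # kfp cron has 6 fields; this runs once a day at midnight
    pipeline_package_path="donpa_rare_pipeline.yaml",
)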