# 1. Data setup (데이터 세팅)
import pandas as pd
# X/y setup: case 1 — features and target arrive pre-split in separate files
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")
X_test = pd.read_csv("X_test.csv")
# X/y setup: case 2 — one train file holds both features and the target column
# NOTE(review): cases 1 and 2 are alternatives; run only the one matching the exam data.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
X_train = train.drop(['target'], axis = 1).copy()
y_train = train['target'].copy()
X_test = test.copy()
# Concatenate train and test features so preprocessing is applied to both consistently
n_train = X_train.shape[0]  # remember the boundary for re-splitting later
X_full = pd.concat([X_train, X_test])
# Keep only useful columns (drop IDs, names, ...)
X_full = X_full.drop(['Id', 'Name', ...], axis = 1)  # '...' is a placeholder — replace with real column names
y_train = y_train.drop(['Id'], axis = 1)
# Binarize the target to 0|1
# NOTE(review): confirm the label string against the data — income datasets commonly
# use '>50K'/'<=50K', and `!=` flips which class becomes 1.
y_train = (y_train['income'] != '>=50').astype(int) # 0|1
# Check class imbalance
y_train.value_counts()
# 2. Preprocessing (전처리)
# Quick structural/statistical overview of the combined feature frame
X_full.info()
X_full.describe()
# Missing-value handling
X_full.isna().sum() # inspect missing counts per column
X_full['Age'] = X_full['Age'].fillna(X_full['Age'].mean())        # numeric: mean imputation
X_full['Team'] = X_full['Team'].fillna(X_full['Team'].mode()[0])  # categorical: mode imputation
# Date handling: extract numeric parts, then drop the raw datetime column
X_full['Date'] = pd.to_datetime(X_full['Date'])
X_full['month'] = X_full['Date'].dt.month
X_full['weekday'] = X_full['Date'].dt.weekday
X_full = X_full.drop('Date', axis = 1)
# Categorical encoding — options 1) and 2) are ALTERNATIVES: use exactly one.
# Running both as written fails, because get_dummies replaces the object columns,
# so X_full['Sex'] etc. no longer exist for the label-encoding loop.
# 1) One-hot encoding: suits linear models
X_full = pd.get_dummies(X_full)
# 2) Label encoding: suits tree models
from sklearn.preprocessing import LabelEncoder
cat_cols = ['Sex', 'Team', 'Grade']
le = LabelEncoder()
for col in cat_cols:
    X_full[col] = le.fit_transform(X_full[col])  # re-fit per column is fine since encoded values are used immediately
# Re-split the combined frame back into train/test rows
X_train = X_full[:n_train]
X_test = X_full[n_train:]
# 3. Train/validation split (학습 데이터분할)
from sklearn.model_selection import train_test_split
# Classification: stratify=y_train keeps the class ratio identical in both splits;
# random_state makes the split reproducible across reruns (important in an exam).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size = 0.2, stratify = y_train, random_state = 42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
# 4. Model training (모델 학습)
# The four model sections below are ALTERNATIVES — each reassigns `model`; run one.
# Random forest — classification
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Random forest — regression
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)
# XGBoost — classification
# XGBoost needs integer class labels, so label-encode y first.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)
# y_test = le.transform(y_test)  # only if y_test exists — it usually isn't provided in the exam
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)               # integer predictions (was missing — y_pred was undefined)
y_pred_label = le.inverse_transform(y_pred)  # map back to the original label strings
# XGBoost — regression
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(X_train, y_train)
# 5. Model evaluation (모델 평가)
# Classification — accuracy
y_val_pred = model.predict(X_val)
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_val_pred)
# Classification — F1
y_val_pred = model.predict(X_val)
from sklearn.metrics import f1_score
f1_score(y_val, y_val_pred) # f1_score(y_val, pred, pos_label = 'A') : treat 'A' as the positive class
# (multi-class)
f1_score(y_val, y_val_pred, average = 'macro') # average = 'macro' (classes weighted equally), 'weighted' (by class frequency)
# Classification — ROC-AUC
y_val_proba = model.predict_proba(X_val)
from sklearn.metrics import roc_auc_score
# Binary: pass only the positive-class column — the full 2-column matrix raises an error.
roc_auc_score(y_val, y_val_proba[:, 1])
# (multi-class: pass the full probability matrix with one-vs-rest)
roc_auc_score(y_val, y_val_proba, multi_class='ovr')
# Regression — R squared
y_val_pred = model.predict(X_val)
from sklearn.metrics import r2_score
r2_score(y_val, y_val_pred)
# Regression — RMSE (take the square root of MSE by hand; works on all sklearn versions)
y_val_pred = model.predict(X_val)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_val, y_val_pred)
rmse = mse ** 0.5
# 6. Final predictions (예측 결과)
# Classification — predict labels and (if ground truth is available) score accuracy
y_test_pred = model.predict(X_test)
result = pd.DataFrame({'Id': test_id, 'pred': y_test_pred})  # NOTE(review): test_id must be saved from the raw test frame before 'Id' was dropped
accuracy_score(y_test['target'], y_test_pred)  # NOTE(review): y_test is usually NOT provided in the exam — practice-only line
# Classification — positive-class probabilities instead of hard labels
y_test_proba = model.predict_proba(X_test)
result = pd.DataFrame({'Id': test_id, 'pred':y_test_proba[:,1]})
'⚙️ Tech > ML' 카테고리의 다른 글
[자격증] 빅데이터분석기사 실기 작업형3 정리 (1) | 2025.06.09 |
---|---|
[자격증] 빅데이터분석기사 실기 작업형1 정리 (0) | 2025.06.08 |
[ML] 최적화 기법 (경사 하강법, SGD, 미니배치경사하강법, 모멘텀, RMSprop, Adam, 베이지안 최적화) (0) | 2025.02.16 |
[ML] 최적화란? (0) | 2025.02.16 |
[ML/python] 앙상블 (voting, bagging, boosting, stacking, 랜덤포레스트, XGBoost, LightGBM) (0) | 2025.02.15 |