import pandas as pd
# Case 1: train features, train target and test features come as three separate CSV files
X_train, y_train, X_test = (
    pd.read_csv(csv_name)
    for csv_name in ("X_train.csv", "y_train.csv", "X_test.csv")
)
# Case 2: one train file that also carries the 'target' column, plus a test file
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Separate the label column from the feature columns
y_train = train['target']
X_train = train.drop(columns=['target'])
# Work on a copy so the raw test frame stays untouched
X_test = test.copy()
# Merge X: stack train and test features so preprocessing is applied to both at once.
# The train row count is remembered so the combined frame can be split again afterwards.
n_train = len(X_train)
X_full = pd.concat([X_train, X_test], axis=0)
# Recode the target as 0/1: 1 where 'income' differs from '>=50', otherwise 0
y = y_train['income'].ne('>=50').astype(int)
# Inspect column dtypes / non-null counts, then summary statistics
X_full.info()
X_full.describe()
# Keep only the columns needed for modelling.
# The submission frames built later are keyed by `test_id`, so capture the
# test-set identifiers (rows after the first n_train) before 'Id' is dropped.
test_id = X_full['Id'].iloc[n_train:]
# NOTE: the `...` in the list is a template placeholder - replace it with the
# remaining column names to drop before running.
X_full = X_full.drop(['Id', 'Name', ...], axis = 1)
y_train = y_train.drop(['Id'], axis = 1)
# Missing-value handling
X_full.isna().sum()  # check the per-column missing counts
# Fill values are derived from the training rows only (the first n_train rows
# of X_full) so that test-set values do not leak into the preprocessing.
X_full['Age'] = X_full['Age'].fillna(X_full['Age'].iloc[:n_train].mean())
X_full['Team'] = X_full['Team'].fillna(X_full['Team'].iloc[:n_train].mode()[0])
# Categorical encoding
# Option 1) One-hot encoding: for linear models
X_full = pd.get_dummies(data=X_full)
# Option 2) Label encoding: for tree models
from sklearn.preprocessing import LabelEncoder

cat_cols = ['Sex', 'Team', 'Grade']
le = LabelEncoder()
for col in cat_cols:
    # fit_transform refits the encoder on every column, so after the loop
    # `le` only holds the mapping of the last column.
    X_full[col] = le.fit_transform(X_full[col])
# Split X: cut the combined frame back into its train and test parts by row position
X_train, X_test = X_full.iloc[:n_train], X_full.iloc[n_train:]
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation. A fixed random_state makes
# the split - and therefore every validation metric below - reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
# Classification: random forest with default hyper-parameters
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training can be chained
model = RandomForestClassifier().fit(X_train, y_train)
# Regression: random forest with default hyper-parameters
from sklearn.ensemble import RandomForestRegressor

# fit() returns the estimator itself, so construction and training can be chained
model = RandomForestRegressor().fit(X_train, y_train)
# Classification - accuracy on the validation split
from sklearn.metrics import accuracy_score

y_val_pred = model.predict(X_val)
print(accuracy_score(y_true=y_val, y_pred=y_val_pred))
# Classification - F1 on the validation split
from sklearn.metrics import f1_score

y_val_pred = model.predict(X_val)
# Binary F1; to choose the positive class explicitly use e.g.
# f1_score(y_val, pred, pos_label = 'A'), which treats 'A' as positive
f1_score(y_true=y_val, y_pred=y_val_pred)
# Multiclass - F1; `average` can be micro, macro or weighted
f1 = f1_score(y_true=y_val, y_pred=y_val_pred, average='macro')
# Classification - ROC-AUC on the validation split
from sklearn.metrics import roc_auc_score

y_val_proba = model.predict_proba(X_val)
# predict_proba returns one column per class. For a binary target
# roc_auc_score expects a 1-D score, so pass only column 1; handing it the
# full two-column matrix raises an error.
# For a multiclass target pass the full matrix together with multi_class='ovr'.
print(roc_auc_score(y_val, y_val_proba[:, 1]))
# Regression - R squared on the validation split
from sklearn.metrics import r2_score

y_val_pred = model.predict(X_val)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)
# Regression - RMSE on the validation split
from sklearn.metrics import mean_squared_error

y_val_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
# RMSE is the square root of the mean squared error
rmse = pow(mse, 0.5)
# Classification - predict the test set and score the predictions
y_test_pred = model.predict(X_test)
# `test_id` is expected to hold the identifiers of the test rows
result = pd.DataFrame(data={'Id': test_id, 'pred': y_test_pred})
# NOTE(review): `y_test` is not defined in the snippets shown here - presumably
# the true test labels with a 'target' column, supplied separately; confirm.
accuracy_score(y_true=y_test['target'], y_pred=y_test_pred)
# Classification - predicted probabilities for the test set
y_test_proba = model.predict_proba(X_test)
# The submission keeps the second probability column (index 1);
# `test_id` is expected to hold the identifiers of the test rows
result = pd.DataFrame(data={'Id': test_id, 'pred': y_test_proba[:, 1]})
[ML] 최적화 기법 (경사 하강법, SGD, 미니배치경사하강법, 모멘텀, RMSprop, Adam, 베이지안 최적화) (0) | 2025.02.16 |
---|---|
[ML] 최적화란? (0) | 2025.02.16 |
[ML/python] 앙상블 (voting, bagging, boosting, stacking, 랜덤포레스트, XGBoost, LightGBM) (0) | 2025.02.15 |
[ML/python] 신경망 (Neural Network, 오류 역전파 알고리즘, 최대 이터레이션) (0) | 2025.02.14 |
[ML/python] 결정 트리 (decision tree, DTC, class_weight, export_text, plot_tree) (0) | 2025.02.14 |