상세 컨텐츠

본문 제목

[빅데이터분석기사] 실기 작업형2 정리

테크/ML

by fiftyline 2025. 5. 18. 20:20

본문

 

 

1. 데이터 세팅

import pandas as pd

# Case 1: features and target are loaded from separate CSV files.
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")
X_test = pd.read_csv("X_test.csv")

# Case 2: a single train file that still contains the 'target' column.
# NOTE: case 1 and case 2 are alternatives; if both are run, the assignments
# below overwrite the variables loaded in case 1.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
X_train = train.drop(['target'], axis = 1)
y_train = train['target']
X_test = test.copy()

# Merge X: stack train and test features into one frame and remember how many
# rows belong to train (n_train) so the frame can be split again afterwards.
n_train = X_train.shape[0]
X_full = pd.concat([X_train, X_test])

# Convert the target values to integers.
# NOTE(review): this marks rows whose 'income' is NOT '>=50' as 1 — confirm
# that this is the intended positive class and the exact label spelling.
y = (y_train['income'] != '>=50').astype(int) # 0|1

 

2. 전처리

# LabelEncoder is used below; without this import the loop raises NameError.
from sklearn.preprocessing import LabelEncoder

# Inspect dtypes / non-null counts and summary statistics.
X_full.info()
X_full.describe()

# Keep the ids of the test rows before 'Id' is dropped from the features;
# the final result frame is built from them.
test_id = X_full['Id'].iloc[n_train:]

# Keep only the needed columns ('...' is a placeholder for further columns).
X_full = X_full.drop(['Id', 'Name', ...], axis = 1)
y_train = y_train.drop(['Id'], axis = 1)

# Missing values
X_full.isna().sum() # check the count per column
# Numeric column: fill with the mean.
X_full['Age'] = X_full['Age'].fillna(X_full['Age'].mean())
# Categorical column: fill with the most frequent value.
X_full['Team'] = X_full['Team'].fillna(X_full['Team'].mode()[0])

# Categorical encoding — use ONE of the two options below, not both:
# after get_dummies the original categorical columns no longer exist,
# so the label-encoding loop would fail with a KeyError.
# 1) One-Hot : linear models
X_full = pd.get_dummies(X_full)
# 2) Label : tree models
cat_cols = ['Sex', 'Team', 'Grade']
le = LabelEncoder()
for col in cat_cols:
    # fit_transform refits the encoder, so one instance can serve every column.
    X_full[col] = le.fit_transform(X_full[col])

# Split X back into train / test by position (the concatenated index
# contains duplicate labels, so positional slicing is the safe choice).
X_train = X_full.iloc[:n_train]
X_test = X_full.iloc[n_train:]

 


3. 학습 데이터 분할

from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation.
# random_state is fixed so the split, and therefore every validation score,
# is reproducible between runs.
# For a classification target, stratify = y_train can be added to preserve
# the class ratios in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size = 0.2, random_state = 42
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

 

 

4. 모델 학습

# Classification: build the forest and fit it in one expression
# (fit returns the fitted estimator itself).
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier().fit(X_train, y_train)

# Regression: same pattern with the regressor.
# Classification and regression are alternatives; running both leaves
# `model` bound to the regressor.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor().fit(X_train, y_train)

 

 

5. 모델 평가

# Classification - accuracy
y_val_pred = model.predict(X_val)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_val, y_val_pred))

# Classification - F1 (binary)
y_val_pred = model.predict(X_val)
from sklearn.metrics import f1_score
# Print the score so it is not silently discarded when run as a script.
# For string labels name the positive class explicitly, e.g.
# f1_score(y_val, y_val_pred, pos_label = 'A')  -> 'A' is the positive class
print(f1_score(y_val, y_val_pred))
# Multiclass - F1
f1 = f1_score(y_val, y_val_pred, average = 'macro') # average = micro, macro, weighted

# Classification - ROC-AUC
# predict_proba returns one column per class. For a binary target
# roc_auc_score expects a 1-D score, so pass only the second column
# (probability of model.classes_[1]); the full 2-D matrix raises an error.
y_val_proba = model.predict_proba(X_val)
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_val, y_val_proba[:, 1]))
# Multiclass: pass the full matrix together with multi_class, e.g.
# roc_auc_score(y_val, y_val_proba, multi_class = 'ovr')

# Regression - coefficient of determination (R^2) on the validation set
from sklearn.metrics import r2_score
y_val_pred = model.predict(X_val)
r2 = r2_score(y_true = y_val, y_pred = y_val_pred)

# Regression - RMSE: square root of the mean squared error
from sklearn.metrics import mean_squared_error
y_val_pred = model.predict(X_val)
mse = mean_squared_error(y_true = y_val, y_pred = y_val_pred)
rmse = pow(mse, 0.5)

 

 

6. 예측 결과

# Classification - predictions and accuracy
# NOTE(review): test_id must already hold the ids of the test rows, captured
# before 'Id' is dropped from the features — make sure it is defined.
y_test_pred = model.predict(X_test)
result = pd.DataFrame({'Id': test_id, 'pred': y_test_pred})
# NOTE(review): y_test is not loaded anywhere in the visible code; this line
# only works when the true test labels are available — confirm.
accuracy_score(y_test['target'], y_test_pred)

# Classification - probabilities
# Column 1 of predict_proba is used as the predicted score.
y_test_proba = model.predict_proba(X_test)
result = pd.DataFrame({'Id': test_id, 'pred':y_test_proba[:,1]})

 

 

 

 

 

 

 

관련글 더보기