2026년 3학년 1학기 기계학습 프로젝트 과제
1. 기본 학습
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
# Baseline experiment: fit a plain LogisticRegression on an 80/20 split and
# print the hold-out accuracy, followed by train and test accuracy.
TARGET_COLUMN = 'Heart Disease Status'

heart_df = pd.read_csv('heart_disease_missing_processed.csv')
features = heart_df.drop(columns=TARGET_COLUMN)  # every column except the target
target = heart_df[TARGET_COLUMN]

x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Accuracy on the held-out 20%.
accuracy = accuracy_score(y_test, model.predict(x_test))
print(accuracy)

# Train vs. test accuracy, printed as bare numbers in that order.
train_acc = accuracy_score(y_train, model.predict(x_train))
test_acc = accuracy_score(y_test, model.predict(x_test))
print(train_acc)
print(test_acc)
accuracy : 0.8065
train accuracy : 0.798375
test accuracy : 0.8065
2. class_weight='balanced' 추가
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
# Same pipeline as the baseline run, except LogisticRegression is created
# with class_weight='balanced'.
label_column = 'Heart Disease Status'

data = pd.read_csv('heart_disease_missing_processed.csv')
y = data[label_column]
X = data.drop(columns=[label_column])  # all remaining columns are the inputs

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(x_train, y_train)

# Predict once per split and score each set of predictions.
test_predictions = model.predict(x_test)
train_predictions = model.predict(x_train)

accuracy = accuracy_score(y_test, test_predictions)
print(accuracy)

train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy_score(y_test, test_predictions)
print(train_acc)
print(test_acc)
accuracy: 0.498
train accuracy: 0.525125
test accuracy: 0.498
3. 평가 지표 변경: Confusion Matrix
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
# Class-weighted LogisticRegression, this time reporting a confusion matrix
# and a classification report in addition to the accuracies.
target_name = 'Heart Disease Status'

frame = pd.read_csv('heart_disease_missing_processed.csv')
X = frame.drop(columns=target_name)  # inputs: everything but the target column
y = frame[target_name]

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(x_train, y_train)

# Test-set predictions are reused for accuracy, confusion matrix and report.
predictions = model.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f"accuracy: {accuracy}")

train_acc = accuracy_score(y_train, model.predict(x_train))
test_acc = accuracy_score(y_test, predictions)
print(f"train accuracy: {train_acc}")
print(f"test accuracy: {test_acc}")

test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions)
print("Confusion Matrix:")
print(test_confusion)
print("Classification_report: ")
print(test_report)
accuracy: 0.498
train accuracy: 0.525125
test accuracy: 0.498
Confusion Matrix:
[[822 791]
[213 174]]
Classification_report:
precision recall f1-score support
0 0.79 0.51 0.62 1613
1 0.18 0.45 0.26 387
accuracy 0.50 2000
macro avg 0.49 0.48 0.44 2000
weighted avg 0.68 0.50 0.55 2000
1. 오차 행렬: False Positive, False Negative가 많음
2. 평가 지표
- Accuracy: 0.5
- Precision: 0.18, 예측한 1중 진짜 1은 18%.
- Recall: 0.45, 숨어있는 진짜 1 데이터 387개 중에서 45%를 찾아냄.
4. SMOTE
: label 0과 label 1의 데이터 개수 차이가 크기 때문에, SMOTE로 소수 클래스(label 1)를 오버샘플링하여 학습 데이터를 증강함.
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
# Oversample the training split with SMOTE, train a class-weighted
# LogisticRegression on the resampled data, then report accuracy, the
# confusion matrix and the classification report.
target_col = 'Heart Disease Status'

dataset = pd.read_csv('heart_disease_missing_processed.csv')
X = dataset.drop(columns=target_col)  # all columns except the target
y = dataset[target_col]

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Resample the training split only; the test split is left untouched.
x_train_smote, y_train_smote = SMOTE(random_state=42).fit_resample(x_train, y_train)

# Train on the SMOTE-resampled data.
model_smote = LogisticRegression(class_weight='balanced', max_iter=1000)
model_smote.fit(x_train_smote, y_train_smote)

# ==== results after SMOTE ====
print("==== SMOTE로 데이터 증강 후 결과 ====")
# Train accuracy is measured on the resampled training data, test accuracy
# on the original test split.
train_acc_smote = accuracy_score(y_train_smote, model_smote.predict(x_train_smote))
test_predictions_smote = model_smote.predict(x_test)
test_acc_smote = accuracy_score(y_test, test_predictions_smote)
print(f"train accuracy: {train_acc_smote}")
print(f"test accuracy: {test_acc_smote}")

# ==== Confusion Matrix / Classification Report ====
smote_confusion = confusion_matrix(y_test, test_predictions_smote)
smote_report = classification_report(y_test, test_predictions_smote)
print("Confusion Matrix:")
print(smote_confusion)
print("Classification_report: ")
print(smote_report)
==== SMOTE로 데이터 증강 후 결과 ====
train accuracy: 0.6342570847033036
test accuracy: 0.5785
Confusion Matrix:
[[1004 609]
[ 234 153]]
Classification_report:
precision recall f1-score support
0 0.81 0.62 0.70 1613
1 0.20 0.40 0.27 387
accuracy 0.58 2000
macro avg 0.51 0.51 0.49 2000
weighted avg 0.69 0.58 0.62 2000
1. 오차행렬: 여전히 False positive가 많음
2. 평가지표
- Accuracy: 0.5 -> 0.58
- Precision: 0.18 -> 0.20
- Recall: 0.45 -> 0.40
5. SMOTE + Threshold / Threshold
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd
from imblearn.over_sampling import SMOTE
# SMOTE + decision-threshold tuning.
#
# The threshold is selected on a validation split carved out of the training
# data, NOT on the test split. Picking the threshold that maximises F1 on the
# test set leaks test labels into model selection and makes the reported F1
# optimistically biased. The test split is only used once, at the end, to
# score the already-chosen threshold.
#
# NOTE: threshold / F1 figures recorded from an earlier run that tuned on the
# test set will differ from what this script prints; re-run to refresh them.
df = pd.read_csv('heart_disease_missing_processed.csv')
X = df.drop('Heart Disease Status', axis=1)  # every column except the target
y = df['Heart Disease Status']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hold out 25% of the training data for threshold selection; stratify so the
# minority class keeps the same share in both parts.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Apply SMOTE to the fitting part only, so that the validation and test
# splits contain no synthetic samples.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_fit, y_fit)

# Train on the SMOTE-resampled data.
model_smote = LogisticRegression(class_weight='balanced', max_iter=1000)
model_smote.fit(x_train_smote, y_train_smote)

# Probability of the positive class (column 1) on the validation split.
val_probabilities = model_smote.predict_proba(x_val)[:, 1]

best_threshold = 0.5
best_f1 = 0
# linspace gives the 80 grid points 0.10, 0.11, ..., 0.89 without relying on
# a floating-point step to decide where the range ends.
for threshold in np.linspace(0.10, 0.89, 80):
    custom_predictions = (val_probabilities >= threshold).astype(int)
    # zero_division=0: a threshold that predicts no positives scores 0
    # instead of emitting an undefined-metric warning.
    current_f1 = f1_score(y_val, custom_predictions, zero_division=0)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

# Both values below come from the validation split.
print(f"최적의 Threshold: {best_threshold:.2f}")
print(F"최고 F1-Score: {best_f1:.4f}")

# Score the selected threshold on the untouched test split.
probabilities = model_smote.predict_proba(x_test)[:, 1]
test_predictions = (probabilities >= best_threshold).astype(int)
test_f1 = f1_score(y_test, test_predictions, zero_division=0)
print(f"Test F1-Score at selected threshold: {test_f1:.4f}")
최적의 Threshold: 0.15
최고 F1-Score: 0.3255
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd
from imblearn.over_sampling import SMOTE
# Decision-threshold tuning for the class-weighted LogisticRegression
# (no SMOTE).
#
# The threshold is selected on a validation split carved out of the training
# data, NOT on the test split. Picking the threshold that maximises F1 on the
# test set leaks test labels into model selection and makes the reported F1
# optimistically biased. The test split is only used once, at the end, to
# score the already-chosen threshold.
#
# NOTE: threshold / F1 figures recorded from an earlier run that tuned on the
# test set will differ from what this script prints; re-run to refresh them.
df = pd.read_csv('heart_disease_missing_processed.csv')
X = df.drop('Heart Disease Status', axis=1)  # every column except the target
y = df['Heart Disease Status']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hold out 25% of the training data for threshold selection; stratify so the
# minority class keeps the same share in both parts.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(x_fit, y_fit)

# Probability of the positive class (column 1) on the validation split.
val_probabilities = model.predict_proba(x_val)[:, 1]

best_threshold = 0.5
best_f1 = 0
# linspace gives the 80 grid points 0.10, 0.11, ..., 0.89 without relying on
# a floating-point step to decide where the range ends.
for threshold in np.linspace(0.10, 0.89, 80):
    custom_predictions = (val_probabilities >= threshold).astype(int)
    # zero_division=0: a threshold that predicts no positives scores 0
    # instead of emitting an undefined-metric warning.
    current_f1 = f1_score(y_val, custom_predictions, zero_division=0)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

# Both values below come from the validation split.
print(f"최적의 Threshold: {best_threshold:.2f}")
print(F"최고 F1-Score: {best_f1:.4f}")

# Score the selected threshold on the untouched test split.
probabilities = model.predict_proba(x_test)[:, 1]
test_predictions = (probabilities >= best_threshold).astype(int)
test_f1 = f1_score(y_test, test_predictions, zero_division=0)
print(f"Test F1-Score at selected threshold: {test_f1:.4f}")
최적의 Threshold: 0.44
최고 F1-Score: 0.3246
6. SGDClassifier
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd
from imblearn.over_sampling import SMOTE
# Train an SGDClassifier with logistic loss, early stopping and balanced
# class weights, then print train/test accuracy, the confusion matrix and
# the classification report for the test split.
target_column = 'Heart Disease Status'

records = pd.read_csv('heart_disease_missing_processed.csv')
X = records.drop(columns=target_column)  # inputs: all columns but the target
y = records[target_column]

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

sgd_settings = {
    'loss': 'log_loss',
    'early_stopping': True,
    'n_iter_no_change': 5,
    'max_iter': 1000,
    'class_weight': 'balanced',
    'random_state': 42,
}
sgd_model = SGDClassifier(**sgd_settings)
sgd_model.fit(x_train, y_train)

# Test-set predictions, reused for the accuracy and for both reports below.
predictions = sgd_model.predict(x_test)

print("=== over/underfitting check ===")
train_acc = accuracy_score(y_train, sgd_model.predict(x_train))
test_acc = accuracy_score(y_test, predictions)
print(f"train accuracy: {train_acc}")
print(f"test accuracy: {test_acc}")

print("=== Confusion Matrix / Classification Report ===")
sgd_confusion = confusion_matrix(y_test, predictions)
sgd_report = classification_report(y_test, predictions)
print("Confusion Matrix:")
print(sgd_confusion)
print("Classification_report: ")
print(sgd_report)
=== over/underfitting check ===
train accuracy: 0.555125
test accuracy: 0.5505
=== Confusion Matrix / Classification Report ===
Confusion Matrix:
[[944 669]
[230 157]]
Classification_report:
precision recall f1-score support
0 0.80 0.59 0.68 1613
1 0.19 0.41 0.26 387
accuracy 0.55 2000
macro avg 0.50 0.50 0.47 2000
weighted avg 0.69 0.55 0.60 2000
7. GridSearchCV
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd
from imblearn.over_sampling import SMOTE
# Grid-search C, penalty and solver for a class-weighted LogisticRegression
# using 5-fold cross-validation scored by F1, then evaluate the best
# estimator on the train and test splits.
target_column = 'Heart Disease Status'

table = pd.read_csv('heart_disease_missing_processed.csv')
X = table.drop(columns=target_column)  # inputs: all columns but the target
y = table[target_column]

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate hyper-parameters; every combination of the three lists is tried.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

base_estimator = LogisticRegression(class_weight='balanced', max_iter=2000)
grid_search = GridSearchCV(base_estimator, param_grid, cv=5, scoring='f1')
grid_search.fit(x_train, y_train)

print(f"가장 성능이 좋은 설정값: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

print("=== over/underfitting check ===")
test_predictions = best_model.predict(x_test)
train_acc = accuracy_score(y_train, best_model.predict(x_train))
test_acc = accuracy_score(y_test, test_predictions)
print(f"train accuracy: {train_acc}")
print(f"test accuracy: {test_acc}")

print("=== Confusion Matrix / Classification Report ===")
best_confusion = confusion_matrix(y_test, test_predictions)
best_report = classification_report(y_test, test_predictions)
print("Confusion Matrix:")
print(best_confusion)
print("Classification_report: ")
print(best_report)
가장 성능이 좋은 설정값: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
=== over/underfitting check ===
train accuracy: 0.522375
test accuracy: 0.494
=== Confusion Matrix / Classification Report ===
Confusion Matrix:
[[812 801]
[211 176]]
Classification_report:
precision recall f1-score support
0 0.79 0.50 0.62 1613
1 0.18 0.45 0.26 387
accuracy 0.49 2000
macro avg 0.49 0.48 0.44 2000
weighted avg 0.68 0.49 0.55 2000
전체적인 평가
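F1-Score가 0.26~0.32를 벗어나지 못함. 클래스 불균형을 보정한 실험(2~7번) 중에서는 Accuracy도 0.58이 가장 높은 정도. (1번 기본 학습의 Accuracy 0.8065는 다수 클래스인 label 0의 비율과 거의 같아 의미 있는 성능으로 보기 어려움.)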
Logistic Regression 같은 선형 모델로 분석할 만큼 간단한 문제가 아니다.
'인공지능 > 머신러닝' 카테고리의 다른 글
| [팀 프로젝트] AI가 쓴 글과 인간이 쓴 글 분류하는 인공지능 만들기 프로젝트 (1) | 2026.06.03 |
|---|