2026년 3학년 1학기 기계학습 프로젝트 과제

1. 기본 학습

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Experiment 1 (baseline): fit an unweighted logistic regression and report
# accuracy on the training split and on a 20% hold-out split.
df = pd.read_csv('heart_disease_missing_processed.csv')

# Every column except 'Heart Disease Status' is used as a feature.
X = df.drop('Heart Disease Status', axis=1)
y = df['Heart Disease Status']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Predict each split exactly once and reuse the result.
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)
predictions = test_predictions  # same hold-out predictions under the older name

train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy_score(y_test, test_predictions)
accuracy = test_acc  # "accuracy" is the hold-out accuracy

# Label every metric so the console output is unambiguous, using the same
# "name: value" format as the later experiments in this report.
print(f"accuracy: {accuracy}")
print(f"train accuracy: {train_acc}")
print(f"test accuracy: {test_acc}")

 

accuracy: 0.8065
train accuracy: 0.798375
test accuracy: 0.8065

 

2. class_weight='balanced' 추가

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Experiment 2: same pipeline as the baseline, but the logistic regression
# is fitted with class_weight='balanced'.
df = pd.read_csv('heart_disease_missing_processed.csv')

# Every column except 'Heart Disease Status' is used as a feature.
X = df.drop('Heart Disease Status', axis=1)
y = df['Heart Disease Status']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(x_train, y_train)

# Predict each split exactly once and reuse the result.
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)
predictions = test_predictions  # same hold-out predictions under the older name

train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy_score(y_test, test_predictions)
accuracy = test_acc  # "accuracy" is the hold-out accuracy

# Labeled output, matching the "name: value" lines recorded in the report.
print(f"accuracy: {accuracy}")
print(f"train accuracy: {train_acc}")
print(f"test accuracy: {test_acc}")

accuracy: 0.498
train accuracy: 0.525125
test accuracy: 0.498

 

 

 

3. 평가 지표 변경: Confusion Matrix

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Experiment 3: class-weighted logistic regression, evaluated with a
# confusion matrix and a per-class classification report in addition to
# plain accuracy.
df = pd.read_csv('heart_disease_missing_processed.csv')

# Separate the label column from the feature columns.
y = df['Heart Disease Status']
X = df.drop(columns='Heart Disease Status')

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(x_train, y_train)

# One prediction pass per split; the hold-out predictions are shared by the
# accuracy figures and by both reports below.
predictions = model.predict(x_test)
test_predictions = predictions
train_predictions = model.predict(x_train)

accuracy = accuracy_score(y_test, predictions)
train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy  # identical by construction: same labels, same predictions

for metric_name, metric_value in (
    ("accuracy", accuracy),
    ("train accuracy", train_acc),
    ("test accuracy", test_acc),
):
    print(f"{metric_name}: {metric_value}")

# Each heading is followed by its table on the next line.
print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")
print("Classification_report: ", classification_report(y_test, predictions), sep="\n")
accuracy: 0.498
train accuracy: 0.525125
test accuracy: 0.498
Confusion Matrix:
[[822 791]
 [213 174]]
Classification_report: 
              precision    recall  f1-score   support

           0       0.79      0.51      0.62      1613
           1       0.18      0.45      0.26       387

    accuracy                           0.50      2000
   macro avg       0.49      0.48      0.44      2000
weighted avg       0.68      0.50      0.55      2000

 

1. 오차 행렬: False Positive(791건)와 False Negative(213건)가 모두 많음

2. 평가 지표

- Accuracy: 0.5

- Precision: 0.18, 1로 예측한 데이터 중 실제로 1인 것은 18%.

- Recall: 0.45, 숨어있는 진짜 1 데이터 387개 중에서 45%를 찾아냄.

 

4. SMOTE

: label 0과 label 1의 데이터 개수 차이가 크기 때문에, SMOTE로 학습 데이터의 소수 클래스(label 1)를 오버샘플링하여 증강함. (테스트 데이터에는 적용하지 않음)

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Experiment 4: oversample the training split with SMOTE, fit the
# class-weighted logistic regression on the resampled data, and evaluate on
# the untouched hold-out split.
df = pd.read_csv('heart_disease_missing_processed.csv')

target_column = 'Heart Disease Status'
y = df[target_column]
X = df.drop(columns=target_column)  # every non-label column is a feature

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Resample the training split only; x_test / y_test keep their original rows.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# NOTE(review): SMOTE presumably equalises the class counts already, so
# class_weight='balanced' may have no effect here - confirm.
model_smote = LogisticRegression(class_weight='balanced', max_iter=1000)
model_smote.fit(x_train_smote, y_train_smote)

print("==== SMOTE로 데이터 증강 후 결과 ====")

# Train accuracy is measured on the resampled training data, test accuracy
# on the original hold-out rows.
train_predictions_smote = model_smote.predict(x_train_smote)
test_predictions_smote = model_smote.predict(x_test)
train_acc_smote = accuracy_score(y_train_smote, train_predictions_smote)
test_acc_smote = accuracy_score(y_test, test_predictions_smote)

for split_name, split_acc in (("train", train_acc_smote), ("test", test_acc_smote)):
    print(f"{split_name} accuracy: {split_acc}")

# Each heading is followed by its table on the next line.
print("Confusion Matrix:", confusion_matrix(y_test, test_predictions_smote), sep="\n")
print(
    "Classification_report: ",
    classification_report(y_test, test_predictions_smote),
    sep="\n",
)
==== SMOTE로 데이터 증강 후 결과 ====
train accuracy: 0.6342570847033036
test accuracy: 0.5785
Confusion Matrix:
[[1004  609]
 [ 234  153]]
Classification_report: 
              precision    recall  f1-score   support

           0       0.81      0.62      0.70      1613
           1       0.20      0.40      0.27       387

    accuracy                           0.58      2000
   macro avg       0.51      0.51      0.49      2000
weighted avg       0.69      0.58      0.62      2000

1. 오차행렬: 여전히 False Positive가 많고(791건 -> 609건), False Negative는 오히려 늘어남(213건 -> 234건)

2. 평가지표

- Accuracy: 0.5 -> 0.58

- Precision: 0.18 -> 0.20

- Recall: 0.45 -> 0.40

 

5. Threshold 조정 (SMOTE 적용 / SMOTE 미적용)

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd

from imblearn.over_sampling import SMOTE

# Experiment 5a: SMOTE + class-weighted logistic regression, followed by a
# sweep over decision thresholds to find the one with the highest F1 score.
df = pd.read_csv('heart_disease_missing_processed.csv')

target_column = 'Heart Disease Status'
y = df[target_column]
X = df.drop(columns=target_column)  # every non-label column is a feature

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the training split only, then fit on the resampled data.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

model_smote = LogisticRegression(class_weight='balanced', max_iter=1000)
model_smote.fit(x_train_smote, y_train_smote)

# Column 1 of predict_proba is the probability of the second class in
# model_smote.classes_ (presumably label 1 - confirm).
probabilities = model_smote.predict_proba(x_test)[:, 1]

# NOTE(review): the threshold is chosen by scoring against y_test, so the
# reported F1 is an optimistic estimate; a separate validation split would
# be needed for an unbiased number.
candidate_thresholds = np.arange(0.1, 0.9, 0.01)
f1_per_threshold = [
    f1_score(y_test, (probabilities >= cutoff).astype(int))
    for cutoff in candidate_thresholds
]

# Defaults apply when no candidate achieves an F1 above zero.
best_threshold = 0.5
best_f1 = 0

# argmax returns the first maximum, i.e. the lowest threshold among ties.
top_index = int(np.argmax(f1_per_threshold))
if f1_per_threshold[top_index] > best_f1:
    best_f1 = f1_per_threshold[top_index]
    best_threshold = candidate_thresholds[top_index]

print(f"최적의 Threshold: {best_threshold:.2f}")
print(f"최고 F1-Score: {best_f1:.4f}")

최적의 Threshold: 0.15
최고 F1-Score: 0.3255

 

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd

from imblearn.over_sampling import SMOTE

# Experiment 5b: class-weighted logistic regression without SMOTE, followed
# by a sweep over decision thresholds to find the one with the highest F1.
df = pd.read_csv('heart_disease_missing_processed.csv')

# Every column except 'Heart Disease Status' is used as a feature.
X = df.drop('Heart Disease Status', axis=1)
y = df['Heart Disease Status']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(x_train, y_train)

# Only the class probabilities are needed for the sweep; the default
# 0.5-threshold labels from model.predict() are not used in this experiment.
# Column 1 is the probability of the second class in model.classes_
# (presumably label 1 - confirm).
probabilities = model.predict_proba(x_test)[:, 1]

# NOTE(review): the threshold is chosen by scoring against y_test, so the
# reported F1 is an optimistic estimate; a separate validation split would
# be needed for an unbiased number.
best_threshold = 0.5  # kept if no candidate reaches an F1 above zero
best_f1 = 0

scored_thresholds = [
    (f1_score(y_test, (probabilities >= threshold).astype(int)), threshold)
    for threshold in np.arange(0.1, 0.9, 0.01)
]

# max() returns the first maximal entry, i.e. the lowest threshold on ties.
top_f1, top_threshold = max(scored_thresholds, key=lambda pair: pair[0])
if top_f1 > best_f1:
    best_f1 = top_f1
    best_threshold = top_threshold

print(f"최적의 Threshold: {best_threshold:.2f}")
print(f"최고 F1-Score: {best_f1:.4f}")

최적의 Threshold: 0.44
최고 F1-Score: 0.3246

 

6. SGDClassifier

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd

from imblearn.over_sampling import SMOTE

# Experiment 6: logistic regression trained by stochastic gradient descent
# (SGDClassifier with log loss), with early stopping and balanced class
# weights.
df = pd.read_csv('heart_disease_missing_processed.csv')

target_column = 'Heart Disease Status'
y = df[target_column]
X = df.drop(columns=target_column)  # every non-label column is a feature

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

sgd_settings = {
    'loss': 'log_loss',
    'early_stopping': True,
    'n_iter_no_change': 5,
    'max_iter': 1000,
    'class_weight': 'balanced',
    'random_state': 42,
}
sgd_model = SGDClassifier(**sgd_settings)
sgd_model.fit(x_train, y_train)

# One prediction pass per split; the hold-out predictions feed both the
# accuracy check and the reports below.
predictions = sgd_model.predict(x_test)
test_predictions = predictions
train_predictions = sgd_model.predict(x_train)

print("=== over/underfitting check ===")

# Similar train and test accuracy suggests the model is not overfitting.
train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy_score(y_test, test_predictions)
for split_name, split_acc in (("train", train_acc), ("test", test_acc)):
    print(f"{split_name} accuracy: {split_acc}")

print("=== Confusion Matrix / Classification Report ===")

# Each heading is followed by its table on the next line.
print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")
print("Classification_report: ", classification_report(y_test, predictions), sep="\n")
=== over/underfitting check ===
train accuracy: 0.555125
test accuracy: 0.5505
=== Confusion Matrix / Classification Report ===
Confusion Matrix:
[[944 669]
 [230 157]]
Classification_report: 
              precision    recall  f1-score   support

           0       0.80      0.59      0.68      1613
           1       0.19      0.41      0.26       387

    accuracy                           0.55      2000
   macro avg       0.50      0.50      0.47      2000
weighted avg       0.69      0.55      0.60      2000

 

 

7. GridSearchCV

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd

from imblearn.over_sampling import SMOTE

# Experiment 7: 5-fold grid search over C, penalty and solver for the
# class-weighted logistic regression, selecting by F1 score.
df = pd.read_csv('heart_disease_missing_processed.csv')

target_column = 'Heart Disease Status'
y = df[target_column]
X = df.drop(columns=target_column)  # every non-label column is a feature

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5 values of C x 2 penalties x 2 solvers = 20 candidate settings.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
}

base_estimator = LogisticRegression(class_weight='balanced', max_iter=2000)
grid_search = GridSearchCV(
    base_estimator,
    param_grid,
    cv=5,
    scoring='f1',
)
# The search sees only the training split; the hold-out split is used
# solely for the final evaluation below.
grid_search.fit(x_train, y_train)

print(f"가장 성능이 좋은 설정값: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

print("=== over/underfitting check ===")

train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)
train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy_score(y_test, test_predictions)

for split_name, split_acc in (("train", train_acc), ("test", test_acc)):
    print(f"{split_name} accuracy: {split_acc}")

print("=== Confusion Matrix / Classification Report ===")

# Each heading is followed by its table on the next line.
print("Confusion Matrix:", confusion_matrix(y_test, test_predictions), sep="\n")
print("Classification_report: ", classification_report(y_test, test_predictions), sep="\n")
가장 성능이 좋은 설정값: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
=== over/underfitting check ===
train accuracy: 0.522375
test accuracy: 0.494
=== Confusion Matrix / Classification Report ===
Confusion Matrix:
[[812 801]
 [211 176]]
Classification_report: 
              precision    recall  f1-score   support

           0       0.79      0.50      0.62      1613
           1       0.18      0.45      0.26       387

    accuracy                           0.49      2000
   macro avg       0.49      0.48      0.44      2000
weighted avg       0.68      0.49      0.55      2000

 

 

전체적인 평가

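F1-Score(label 1 기준)가 0.26~0.33 범위를 벗어나지 못함. class_weight='balanced' 적용 이후에는 Accuracy도 0.58이 가장 높은 정도. (기본 학습의 Accuracy 0.8065는 테스트 데이터의 label 0 비율 1613/2000 = 0.8065와 같으므로, 대부분을 0으로 예측한 결과로 보임.)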

Logistic Regression과 같은 선형 모델로 분석할 만큼 간단한 문제가 아니다.