Predicting Heart Disease Using Fundemental Machine Learning Algorithms¶
Data Loading and Cleaning¶
In [ ]:
# Loading Pandas Library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score
from sklearn.model_selection import GridSearchCV
In [ ]:
# Load the Heart Disease UCI dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"]
heart_df = pd.read_csv(url,names=names, na_values="?")
In [ ]:
# Drop rows with missing values
heart_df.dropna(inplace=True)
heart_df = heart_df.reset_index(drop=True)
heart_df['heart_disease'] = heart_df['target'].apply(lambda x: 0 if x== 0 else 1) # transfroms target to be 0 for no heart disease, and 1 for any form of heart disease.
heart_df = heart_df.drop(labels='target', axis =1)
print(heart_df.isnull().sum())
print(heart_df.shape)
age 0 sex 0 cp 0 trestbps 0 chol 0 fbs 0 restecg 0 thalach 0 exang 0 oldpeak 0 slope 0 ca 0 thal 0 heart_disease 0 dtype: int64 (297, 14)
In [ ]:
# Display basic information about the dataset
print("Number of observations:", heart_df.shape[0])
print("Number of features:", heart_df.shape[1] - 1) # Exclude target column
print(heart_df.dtypes)
Number of observations: 297 Number of features: 13 age float64 sex float64 cp float64 trestbps float64 chol float64 fbs float64 restecg float64 thalach float64 exang float64 oldpeak float64 slope float64 ca float64 thal float64 heart_disease int64 dtype: object
In [ ]:
# One-hot encode cp variable
from sklearn.preprocessing import OneHotEncoder
# Instantiate OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
# Apply OneHotEncoder to the 'cp' column
encoded_features = encoder.fit_transform(heart_df[['cp']])
encoded_feature_names = encoder.get_feature_names_out(['cp'])
# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)
# Drop the original 'cp' column
heart_df = heart_df.drop(columns=['cp'])
# Concatenate the original DataFrame with the encoded DataFrame
heart_df = pd.concat([heart_df, encoded_df], axis=1)
# Rename the new variables
heart_df = heart_df.rename(columns={"cp_1.0":"cp: typical angina", "cp_2.0":"cp: atypical angina", "cp_3.0":"cp: non-anginal pain", "cp_4.0":"cp: asymptomatic"})
# Display the updated DataFrame
print(heart_df.head())
age sex trestbps chol fbs restecg thalach exang oldpeak slope \ 0 63.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 1 67.0 1.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 2 67.0 1.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 3 37.0 1.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 4 41.0 0.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 ca thal heart_disease cp: typical angina cp: atypical angina \ 0 0.0 6.0 0 1.0 0.0 1 3.0 3.0 1 0.0 0.0 2 2.0 7.0 1 0.0 0.0 3 0.0 3.0 0 0.0 0.0 4 0.0 3.0 0 0.0 1.0 cp: non-anginal pain cp: asymptomatic 0 0.0 0.0 1 0.0 1.0 2 0.0 1.0 3 1.0 0.0 4 0.0 0.0
In [ ]:
# OneHotEncoder for 'slope' feature
encoder = OneHotEncoder(sparse_output=False)
# Apply OneHotEncoder to the 'slope' column
encoded_features = encoder.fit_transform(heart_df[['slope']])
encoded_feature_names = encoder.get_feature_names_out(['slope'])
# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)
# Drop the original 'slope' column
heart_df = heart_df.drop(columns=['slope'])
# Concatenate the original DataFrame with the encoded DataFrame
heart_df = pd.concat([heart_df, encoded_df], axis=1)
# Rename the new variables
heart_df = heart_df.rename(columns={"slope_1.0":"upsloping", "slope_2.0":"flat", "slope_3.0":"downsloping"})
# Display the updated DataFrame
print(heart_df.head())
age sex trestbps chol fbs restecg thalach exang oldpeak ca \ 0 63.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 0.0 1 67.0 1.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 3.0 2 67.0 1.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 3 37.0 1.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 0.0 4 41.0 0.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 0.0 thal heart_disease cp: typical angina cp: atypical angina \ 0 6.0 0 1.0 0.0 1 3.0 1 0.0 0.0 2 7.0 1 0.0 0.0 3 3.0 0 0.0 0.0 4 3.0 0 0.0 1.0 cp: non-anginal pain cp: asymptomatic upsloping flat downsloping 0 0.0 0.0 0.0 0.0 1.0 1 0.0 1.0 0.0 1.0 0.0 2 0.0 1.0 0.0 1.0 0.0 3 1.0 0.0 0.0 0.0 1.0 4 0.0 0.0 1.0 0.0 0.0
In [ ]:
# OneHotEncoder for 'thal' feature
encoder = OneHotEncoder(sparse_output=False)
# Apply OneHotEncoder to the 'thal' column
encoded_features = encoder.fit_transform(heart_df[['thal']])
encoded_feature_names = encoder.get_feature_names_out(['thal'])
# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names,index=heart_df.index)
# Drop the original 'thal' column
heart_df = heart_df.drop(columns=['thal'])
# Concatenate the original DataFrame with the encoded DataFrame
heart_df = pd.concat([heart_df, encoded_df], axis=1)
# Rename the new variables
heart_df = heart_df.rename(columns={"thal_3.0":"thal: normal", "thal_6.0":"thal: fixed defect", "thal_7.0":"thal: reversable defect"})
# Display the updated DataFrame
print(heart_df.head())
age sex trestbps chol fbs restecg thalach exang oldpeak ca \ 0 63.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 0.0 1 67.0 1.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 3.0 2 67.0 1.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 3 37.0 1.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 0.0 4 41.0 0.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 0.0 ... cp: typical angina cp: atypical angina cp: non-anginal pain \ 0 ... 1.0 0.0 0.0 1 ... 0.0 0.0 0.0 2 ... 0.0 0.0 0.0 3 ... 0.0 0.0 1.0 4 ... 0.0 1.0 0.0 cp: asymptomatic upsloping flat downsloping thal: normal \ 0 0.0 0.0 0.0 1.0 0.0 1 1.0 0.0 1.0 0.0 1.0 2 1.0 0.0 1.0 0.0 0.0 3 0.0 0.0 0.0 1.0 1.0 4 0.0 1.0 0.0 0.0 1.0 thal: fixed defect thal: reversable defect 0 1.0 0.0 1 0.0 0.0 2 0.0 1.0 3 0.0 0.0 4 0.0 0.0 [5 rows x 21 columns]
In [ ]:
# OneHotEncoder for 'restecg' feature
encoder = OneHotEncoder(sparse_output=False)
# Apply OneHotEncoder to the 'restecg' column
encoded_features = encoder.fit_transform(heart_df[['restecg']])
encoded_feature_names = encoder.get_feature_names_out(['restecg'])
# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names,index=heart_df.index)
# Drop the original 'restecg' column
heart_df = heart_df.drop(columns=['restecg'])
# Concatenate the original DataFrame with the encoded DataFrame
heart_df = pd.concat([heart_df, encoded_df], axis=1)
# Rename the new variables
heart_df = heart_df.rename(columns={"restecg_0.0":"ecg: normal", "restecg_1.0":"ecg: ST-T wave abnormality", "restecg_2.0":"ecg: left ventricular hypertrophy"})
# Display the updated DataFrame
print(heart_df.head())
age sex trestbps chol fbs thalach exang oldpeak ca \ 0 63.0 1.0 145.0 233.0 1.0 150.0 0.0 2.3 0.0 1 67.0 1.0 160.0 286.0 0.0 108.0 1.0 1.5 3.0 2 67.0 1.0 120.0 229.0 0.0 129.0 1.0 2.6 2.0 3 37.0 1.0 130.0 250.0 0.0 187.0 0.0 3.5 0.0 4 41.0 0.0 130.0 204.0 0.0 172.0 0.0 1.4 0.0 heart_disease ... cp: asymptomatic upsloping flat downsloping \ 0 0 ... 0.0 0.0 0.0 1.0 1 1 ... 1.0 0.0 1.0 0.0 2 1 ... 1.0 0.0 1.0 0.0 3 0 ... 0.0 0.0 0.0 1.0 4 0 ... 0.0 1.0 0.0 0.0 thal: normal thal: fixed defect thal: reversable defect ecg: normal \ 0 0.0 1.0 0.0 0.0 1 1.0 0.0 0.0 0.0 2 0.0 0.0 1.0 0.0 3 1.0 0.0 0.0 1.0 4 1.0 0.0 0.0 0.0 ecg: ST-T wave abnormality ecg: left ventricular hypertrophy 0 0.0 1.0 1 0.0 1.0 2 0.0 1.0 3 0.0 0.0 4 0.0 1.0 [5 rows x 23 columns]
In [ ]:
# Looking for Outliers and investigating the data set.
heart_df.describe()[['age','trestbps','chol','thalach','oldpeak']]
Out[ ]:
age | trestbps | chol | thalach | oldpeak | |
---|---|---|---|---|---|
count | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 |
mean | 54.542088 | 131.693603 | 247.350168 | 149.599327 | 1.055556 |
std | 9.049736 | 17.762806 | 51.997583 | 22.941562 | 1.166123 |
min | 29.000000 | 94.000000 | 126.000000 | 71.000000 | 0.000000 |
25% | 48.000000 | 120.000000 | 211.000000 | 133.000000 | 0.000000 |
50% | 56.000000 | 130.000000 | 243.000000 | 153.000000 | 0.800000 |
75% | 61.000000 | 140.000000 | 276.000000 | 166.000000 | 1.600000 |
max | 77.000000 | 200.000000 | 564.000000 | 202.000000 | 6.200000 |
In [ ]:
# Checking chol for outliers
import plotly.express as px
fig = px.box(heart_df,y="chol")
fig.show()
# Dropping most extreme value
heart_df = heart_df[heart_df['chol'] != 564]
heart_df = heart_df.reset_index()
{% raw %} {% endraw %}
In [ ]:
# Checking oldpeak for outliers
import plotly.express as px
fig = px.box(heart_df,y="oldpeak")
fig.show()
# Dropping most extreme values
heart_df = heart_df[heart_df['oldpeak'] != 6.2]
heart_df = heart_df[heart_df['oldpeak'] != 5.6]
{% raw %} {% endraw %}
In [ ]:
# Min-max scaling the numerical features
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()
heart_df['age'] = min_max.fit_transform(heart_df[['age']])
heart_df['trestbps'] = min_max.fit_transform(heart_df[['trestbps']])
heart_df['chol'] = min_max.fit_transform(heart_df[['chol']])
heart_df['thalach'] = min_max.fit_transform(heart_df[['thalach']])
heart_df['oldpeak'] = min_max.fit_transform(heart_df[['oldpeak']])
heart_df = heart_df.drop("index", axis=1)
heart_df.head()
Out[ ]:
age | sex | trestbps | chol | fbs | thalach | exang | oldpeak | ca | heart_disease | ... | cp: asymptomatic | upsloping | flat | downsloping | thal: normal | thal: fixed defect | thal: reversable defect | ecg: normal | ecg: ST-T wave abnormality | ecg: left ventricular hypertrophy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.708333 | 1.0 | 0.481132 | 0.367698 | 1.0 | 0.603053 | 0.0 | 0.522727 | 0.0 | 0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 0.791667 | 1.0 | 0.622642 | 0.549828 | 0.0 | 0.282443 | 1.0 | 0.340909 | 3.0 | 1 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
2 | 0.791667 | 1.0 | 0.245283 | 0.353952 | 0.0 | 0.442748 | 1.0 | 0.590909 | 2.0 | 1 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
3 | 0.166667 | 1.0 | 0.339623 | 0.426117 | 0.0 | 0.885496 | 0.0 | 0.795455 | 0.0 | 0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
4 | 0.250000 | 0.0 | 0.339623 | 0.268041 | 0.0 | 0.770992 | 0.0 | 0.318182 | 0.0 | 0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 rows × 23 columns
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns
# Histograms for numerical features
heart_df.hist(bins=20, figsize=(14, 10))
plt.show()
In [ ]:
# Feature Selection tool
def auto_feature_selector(dataset_path, methods=[]):
# uses methods below to find the best features through aggregating the results of the
# dataset_path - a pandas dataframe
# methods - a list of various feature selection methods we outlined before, use them all here (list)
# preprocessing
X, y, num_feats = preprocess_dataset(dataset_path)
# creating lists and setting them to False
feature_name = list(X.columns)
cor_support = [False for i in range(0, len(feature_name))]
chi_support = [False for i in range(0, len(feature_name))]
rfe_support = [False for i in range(0, len(feature_name))]
embedded_lr_support = [False for i in range(0, len(feature_name))]
embedded_rf_support = [False for i in range(0, len(feature_name))]
# Run every method we outlined above from the methods list and collect returned best features from every method
if 'pearson' in methods:
cor_support, cor_feature = cor_selector(X, y, num_feats)
if 'chi-square' in methods:
chi_support, chi_feature = chi_squared_selector(X, y, num_feats)
if 'rfe' in methods:
rfe_support, rfe_feature = rfe_selector(X, y, num_feats)
if 'log-reg' in methods:
embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
if 'rf' in methods:
embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
# Combine all the above feature list and count the maximum set of features that got selected by all methods
feature_selection_df = pd.DataFrame(
{'Feature': feature_name, 'Pearson': cor_support, 'Chi-2': chi_support, 'RFE': rfe_support,
'Logistics': embedded_lr_support,
'Random Forest': embedded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df.drop('Feature', axis=1), axis=1)
# display the top features
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
best_features = feature_selection_df['Feature'][0:num_feats]
return best_features
def preprocess_dataset(dataset):
# splitting data set and picking number of features
X = heart_df.drop(['heart_disease'],axis=1)
y = heart_df['heart_disease']
num_feats = 9
return X, y, num_feats
# Method that finds best features using pearson correlation
def cor_selector(X, y,num_feats):
cor_list = []
feature_name = X.columns.tolist()
# calculate the correlation with y for each feature
for i in X.columns.tolist():
cor = np.corrcoef(X[i], y)[0, 1]
cor_list.append(cor)
# replace NaN with 0
cor_list = [0 if np.isnan(i) else i for i in cor_list]
cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
# feature selection? 0 for not select, 1 for select
cor_support = [True if i in cor_feature else False for i in feature_name]
return cor_support, cor_feature
# Method that finds best features using chi-squared
def chi_squared_selector(X, y, num_feats):
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector =SelectKBest(chi2,k=num_feats)
chi_selector.fit(X_norm,y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
return chi_support, chi_feature
# Method that finds best features using rfe
def rfe_selector(X, y, num_feats):
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
X_norm= MinMaxScaler().fit_transform(X)
rfe_selector = RFE(estimator=LogisticRegression(),
n_features_to_select= num_feats,
step=10,verbose=0)
rfe_selector.fit(X_norm,y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
return rfe_support, rfe_feature
# Method that finds best features using logistic regression
def embedded_log_reg_selector(X, y, num_feats):
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
embedded_lr_selector = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=0),
max_features=num_feats)
embedded_lr_selector.fit(X_norm, y)
embedded_lr_support = embedded_lr_selector.get_support()
embedded_lr_feature = X.loc[:, embedded_lr_support].columns.tolist()
return embedded_lr_support, embedded_lr_feature
# Method that finds best features using Random Forest
def embedded_rf_selector(X, y, num_feats):
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
embedded_rf_select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0),
max_features=num_feats)
embedded_rf_select.fit(X, y)
embedded_rf_support = embedded_rf_select.get_support()
embedded_rf_feature = X.loc[:, embedded_rf_support].columns.tolist()
return embedded_rf_support, embedded_rf_feature
methods = ['pearson', 'chi-square', 'rfe', 'log-reg', 'rf']
best_features = auto_feature_selector(dataset_path=heart_df,methods=methods)
print("According to your selected methods, these are the best features: \n")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(best_features)
According to your selected methods, these are the best features: 1 cp: asymptomatic 2 ca 3 thalach 4 thal: reversable defect 5 thal: normal 6 oldpeak 7 exang 8 upsloping 9 trestbps Name: Feature, dtype: object
In [ ]:
# Create test and train datasets containing selected features
X = heart_df[['cp: asymptomatic','ca','thalach','thal: reversable defect','thal: normal','oldpeak','exang','upsloping','trestbps']]
y = heart_df['heart_disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
In [ ]:
# Show the first few rows of the dataset
print(X_train.head())
heart_df.shape
heart_df.to_csv("heart_disease.csv",sep=",",index=False, encoding='utf-8')
Logistic Regression¶
Creating Basic Model¶
In [ ]:
# Importing necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
import seaborn as sns
# Creating the model
logmodel = LogisticRegression(random_state=0,max_iter=10000).fit(X_train,y_train)
# Generating the predictions and creating the confusion matrix with the accuracy score
log_predictions = logmodel.predict(X_test)
train_predictions = logmodel.predict(X_train)
print("Recall (test set):",recall_score(y_test,log_predictions),
"\nAccuracy (train set):", accuracy_score(y_train,train_predictions),
"\nAccuracy (test set):", accuracy_score(y_test,log_predictions),
"\nPercision (test set):", precision_score(y_test,log_predictions),
"\nF1 score (test set):", f1_score(y_test,log_predictions),
"\nConfusion Matrix (test set):\n",confusion_matrix(y_test,log_predictions,labels=[1,0]).transpose())
In [ ]:
# Visualize Confusion Matrix
cm = confusion_matrix(y_test,log_predictions,labels=[1,0]).transpose()
# Create DataFrame for confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Baseline Logistic Regression Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Tuning Hyperparameters¶
In [ ]:
# Creating a dictionary containing all hyperparameter values to consider.
grid_values = {'C': [0.001,0.01,0.1,1,10,100,1000],'tol':[0.0001,0.001,0.01,0.1,1,10]}
grid = GridSearchCV(logmodel,grid_values,refit=True,verbose=0,scoring='accuracy') # refits the estimator using the best found parameters
grid.fit(X_train,y_train)
# Getting the predicted values, as well as printing the accuracy and confusion matrix.
grid_predictions = grid.predict(X_test)
train_predictions = grid.predict(X_train)
print("Recall (test set):",recall_score(y_test,grid_predictions),
"\nAccuracy (train set):", accuracy_score(y_train,train_predictions),
"\nAccuracy (test set):", accuracy_score(y_test,grid_predictions),
"\nPercision (test set):", precision_score(y_test,grid_predictions),
"\nF1 score (test set):", f1_score(y_test,grid_predictions),
"\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
"\nBest Parameters:", grid.best_params_)
# Note the recall of this model is worse than the orginal. This is acceptable because it was created using cross validation.
# This means that though the data performs worse on the test set, it will likely perform better on average on new data points.
In [ ]:
# Visualize Confusion Matrix
cm = confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose()
# Create DataFrame for confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Tuned Logistic Regression Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Dropping Features and Revaluating¶
In [ ]:
# Dropping least important variables and fitting a new model
dropped_X_train = X_train.drop(['trestbps','upsloping','exang'],axis=1)
dropped_X_test = X_test.drop(['trestbps','upsloping','exang'],axis=1)
grid.fit(dropped_X_train,y_train)
# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(dropped_X_test)
train_predictions = grid.predict(dropped_X_train)
print("Recall (test set):",recall_score(y_test,grid_predictions),
"\nAccuracy (train set):", accuracy_score(y_train,train_predictions),
"\nAccuracy (test set):", accuracy_score(y_test,grid_predictions),
"\nPercision (test set):", precision_score(y_test,grid_predictions),
"\nF1 score (test set):", f1_score(y_test,grid_predictions),
"\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
"\nBest Parameters:", grid.best_params_)
Recall (test set): 0.8292682926829268 Accuracy (train set): 0.8341463414634146 Accuracy (test set): 0.8539325842696629 Percision (test set): 0.85 F1 score (test set): 0.8395061728395061 Confusion Matrix: [[34 6] [ 7 42]] Best Parameters: {'C': 1, 'tol': 1}
In [ ]:
# Visualize Confusion Matrix
cm = confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose()
# Create DataFrame for confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Tuned Logistic Regression Model with Dropped Features')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Decision Tree¶
Creating Basic Model¶
In [ ]:
# Importing necessary libraries
from sklearn.tree import DecisionTreeClassifier
# Creating the decision tree and fitting it with default values
DT_clf = DecisionTreeClassifier(random_state=0)
DT_clf.fit(X_train,y_train)
DT_test_predictions = DT_clf.predict(X_test)
DT_train_predictions = DT_clf.predict(X_train)
# Print evaluation metrics
print("Recall (test set):", recall_score(y_test, DT_test_predictions))
print("Accuracy (train set):", accuracy_score(y_train, DT_train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, DT_test_predictions))
print("Precision (test set):", precision_score(y_test, DT_test_predictions))
print("F1 score (test set):", f1_score(y_test, DT_test_predictions))
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, DT_test_predictions, labels=[1, 0]).transpose())
Recall (test set): 0.7317073170731707 Accuracy (train set): 1.0 Accuracy (test set): 0.7078651685393258 Precision (test set): 0.6666666666666666 F1 score (test set): 0.6976744186046511 Confusion Matrix (test set): [[30 15] [11 33]]
In [ ]:
# Visualize Confusion Matrix
cm_DT = confusion_matrix(y_test,log_predictions,labels=[1,0]).transpose()
# Create DataFrame for confusion matrix
cm_df_DT = pd.DataFrame(cm_DT, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df_DT, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Decision Tree Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Tuning Hyperparameters¶
In [ ]:
# Creating a dictionary containing all hyperparameter values to consider
grid_values = {
'max_depth': [None, 10, 20, 30, 40, 50],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
# Perform GridSearchCV to find the best hyperparameters
grid = GridSearchCV(DT_clf, grid_values, refit=True, verbose=0, scoring='accuracy')
grid.fit(X_train, y_train)
# Getting the predicted values, as well as printing the accuracy and confusion matrix
grid_predictions = grid.predict(X_test)
train_predictions = grid.predict(X_train)
print("Recall (test set):", recall_score(y_test, grid_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, grid_predictions))
print("Precision (test set):", precision_score(y_test, grid_predictions))
print("F1 score (test set):", f1_score(y_test, grid_predictions))
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose())
print("Best Parameters:", grid.best_params_)
# Note: The recall of this model might be worse than the original.
# This is acceptable because it was created using cross-validation.
# This means that though the data performs worse on the test set, it will likely perform better on average on new data points
Recall (test set): 0.7073170731707317 Accuracy (train set): 0.9170731707317074 Accuracy (test set): 0.7640449438202247 Precision (test set): 0.7631578947368421 F1 score (test set): 0.7341772151898733 Confusion Matrix (test set): [[29 9] [12 39]] Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
In [ ]:
# Visualize Confusion Matrix
cm_DT = confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose()
# Create DataFrame for confusion matrix
cm_df_DT = pd.DataFrame(cm_DT, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df_DT, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Tuned Decision Tree Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Dropping Features and Revaluating¶
In [ ]:
# Dropping least important variables and fitting a new model
dropped_X_train = X_train.drop(['trestbps', 'upsloping', 'exang'], axis=1)
dropped_X_test = X_test.drop(['trestbps', 'upsloping', 'exang'], axis=1)
grid.fit(dropped_X_train, y_train)
# Getting the predicted values, as well as printing the accuracy and confusion matrix
grid_predictions = grid.predict(dropped_X_test)
train_predictions = grid.predict(dropped_X_train)
print("Recall (test set):", recall_score(y_test, grid_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, grid_predictions))
print("Precision (test set):", precision_score(y_test, grid_predictions))
print("F1 score (test set):", f1_score(y_test, grid_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose())
print("Best Parameters:", grid.best_params_)
Recall (test set): 0.6829268292682927 Accuracy (train set): 0.8926829268292683 Accuracy (test set): 0.7528089887640449 Precision (test set): 0.7567567567567568 F1 score (test set): 0.7179487179487181 Confusion Matrix: [[28 9] [13 39]] Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
In [ ]:
# Confusion Matrix
cm = confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose()
# Create DataFrame for confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Tuned Decision Tree Model with Dropped Features')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Random Forest¶
Creating Basic Model¶
In [ ]:
# Importing Necessary Libraries
from sklearn.ensemble import RandomForestClassifier
# Creating the random forest and fitting it to the data.
RF_clf = RandomForestClassifier(random_state = 1)
RF_clf.fit(X_train,y_train)
RF_predictions = RF_clf.predict(X_test)
#Creating the confusion matrix and reporting recall
print("Recall score:",recall_score(y_test,RF_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,RF_predictions,labels=[1,0]).transpose())
Recall score: 0.6666666666666666 Confusion Matrix: [[28 7] [14 41]]
Tuning Hyperparameters¶
In [ ]:
# Creating a dictionary containing all hyperparameter values to consider.
grid_values = {
'n_estimators': [100, 200, 500],
'max_features': ['sqrt', 'log2'],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'criterion': ['gini', 'entropy']
}
# Creating the grid object itself, and fitting it to the data.
grid = GridSearchCV(RF_clf,grid_values,refit=True,verbose=0,scoring='recall') # finds optimal hyperparameter values (optimizing recall)
grid.fit(X_train,y_train)
# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(X_test)
print("Recall score:",recall_score(y_test,grid_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
"\nBest Parameters:", grid.best_params_)
# Performs the same as the orginal model, but since it was created through cross validation, it will be more robust to new data.
Recall score: 0.7142857142857143 Confusion Matrix: [[30 4] [12 44]] Best Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}
Dropping Features and Revaluating¶
In [ ]:
# Dropping cholesterol level and fitting a new model
dropped_X_train = X_train.drop(['chol','cp','age'],axis=1)
dropped_X_test = X_test.drop(['chol','cp','age'],axis=1)
grid.fit(dropped_X_train,y_train)
# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(dropped_X_test)
print("Recall score:",recall_score(y_test,grid_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
"\nBest Parameters:", grid.best_params_)
Recall score: 0.7380952380952381 Confusion Matrix: [[31 7] [11 41]] Best Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
Stochastic Gradient Descent¶
Creating Basic Model¶
In [ ]:
# Importing Necessary Libraries
from sklearn.linear_model import SGDClassifier
# Creating the SGD model
sgd_model = SGDClassifier(random_state=0, max_iter=10000).fit(X_train, y_train)
# Generating the predictions and creating the confusion matrix with the accuracy score
sgd_predictions = sgd_model.predict(X_test)
train_predictions = sgd_model.predict(X_train)
print("Recall (test set):", recall_score(y_test, sgd_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, sgd_predictions))
print("Precision (test set):", precision_score(y_test, sgd_predictions))
print("F1 score (test set):", f1_score(y_test, sgd_predictions))
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, sgd_predictions, labels=[1, 0]).transpose())
Recall (test set): 0.8292682926829268 Accuracy (train set): 0.8536585365853658 Accuracy (test set): 0.8764044943820225 Precision (test set): 0.8947368421052632 F1 score (test set): 0.860759493670886 Confusion Matrix (test set): [[34 4] [ 7 44]]
In [ ]:
# Confusion Matrix Visualization
cm = confusion_matrix(y_test, sgd_predictions, labels=[1, 0]).transpose()
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for SGD Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Tuning Hyperparameters¶
In [ ]:
# Creating a dictionary containing all hyperparameter values to consider
grid_values = {
'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
'penalty': ['l2', 'l1', 'elasticnet'],
'max_iter': [1000, 5000, 10000]
}
# Perform GridSearchCV to find the best hyperparameters
grid = GridSearchCV(sgd_model, grid_values, refit=True, verbose=0, scoring='accuracy')
grid.fit(X_train, y_train)
# Getting the predicted values, as well as printing the accuracy and confusion matrix
grid_predictions = grid.predict(X_test)
train_predictions = grid.predict(X_train)
print("Recall (test set):", recall_score(y_test, grid_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, grid_predictions))
print("Precision (test set):", precision_score(y_test, grid_predictions))
print("F1 score (test set):", f1_score(y_test, grid_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose())
print("Best Parameters:", grid.best_params_)
Recall (test set): 0.8780487804878049 Accuracy (train set): 0.8146341463414634 Accuracy (test set): 0.8876404494382022 Precision (test set): 0.8780487804878049 F1 score (test set): 0.8780487804878049 Confusion Matrix: [[36 5] [ 5 43]] Best Parameters: {'alpha': 0.001, 'max_iter': 1000, 'penalty': 'l2'}
In [ ]:
# Confusion Matrix Visualization
cm = confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose()
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for SGD Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Dropping Features and Revaluating¶
In [ ]:
# Perform GridSearchCV to find the best hyperparameters
grid = GridSearchCV(sgd_model, grid_values, refit=True, verbose=0, scoring='accuracy')
grid.fit(X_train, y_train)
# Dropping least important variables and fitting a new model
dropped_X_train = X_train.drop(['trestbps', 'upsloping', 'exang'], axis=1)
dropped_X_test = X_test.drop(['trestbps', 'upsloping', 'exang'], axis=1)
grid.fit(dropped_X_train, y_train)
# Getting the predicted values, as well as printing the accuracy and confusion matrix
grid_predictions = grid.predict(dropped_X_test)
train_predictions = grid.predict(dropped_X_train)
print("Recall (test set):", recall_score(y_test, grid_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, grid_predictions))
print("Precision (test set):", precision_score(y_test, grid_predictions))
print("F1 score (test set):", f1_score(y_test, grid_predictions))
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose())
print("Best Parameters:", grid.best_params_)
Recall (test set): 0.7560975609756098 Accuracy (train set): 0.8146341463414634 Accuracy (test set): 0.8314606741573034 Precision (test set): 0.8611111111111112 F1 score (test set): 0.8051948051948052 Confusion Matrix (test set): [[31 5] [10 43]] Best Parameters: {'alpha': 0.01, 'max_iter': 1000, 'penalty': 'l2'}
In [ ]:
# Confusion Matrix Visualization
cm = confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose()
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for SGD Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Support Vector Machine¶
Creating Basic Model¶
In [ ]:
# Importing Necessary Libraries
from sklearn.svm import SVC
# Creating the random forest and fitting it to the data.
SVM_model = SVC(random_state=1)
SVM_model.fit(X_train,y_train)
SVM_predictions = SGD_model.predict(X_test)
#Creating the confusion matrix and reporting recall
print("Recall score:",recall_score(y_test,SVM_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,SVM_predictions,labels=[1,0]).transpose())
Recall score: 0.6904761904761905 Confusion Matrix: [[29 11] [13 37]]
Tuning Hyperparameters¶
In [ ]:
# Creating a dictionary containing all hyperparameter values to consider.
grid_values = {'C': [0.1,1,10,100,1000], 'gamma': [1,0.1,0.001,0.0001],'kernel':['rbf']}
# Creating the grid object itself, and fitting it to the data.
grid = GridSearchCV(SVM_model,grid_values,refit=True,verbose=0,scoring='recall') # finds optimal hyperparameter values (optimizing recall)
grid.fit(X_train,y_train)
# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(X_test)
print("Recall score:",recall_score(y_test,grid_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
"\nBest Parameters:", grid.best_params_)
# Performs the same as the orginal model, but since it was created through cross validation, it will be more robust to new data.
Recall score: 0.7142857142857143 Confusion Matrix: [[30 5] [12 43]] Best Parameters: {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
Dropping Features and Revaluating¶
In [ ]:
# Dropping cholesterol level and fitting a new model
dropped_X_train = X_train.drop(['chol','cp','age'],axis=1)
dropped_X_test = X_test.drop(['chol','cp','age'],axis=1)
grid.fit(dropped_X_train,y_train)
# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(dropped_X_test)
print("Recall score:",recall_score(y_test,grid_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
"\nBest Parameters:", grid.best_params_)
Recall score: 0.7380952380952381 Confusion Matrix: [[31 6] [11 42]] Best Parameters: {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}