# Loading Pandas Library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score
from sklearn.model_selection import GridSearchCV

# Load the Heart Disease UCI dataset

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"]
heart_df = pd.read_csv(url,names=names, na_values="?")

# Drop rows with missing values
heart_df.dropna(inplace=True)
heart_df = heart_df.reset_index(drop=True)
heart_df['heart_disease'] = heart_df['target'].apply(lambda x: 0 if x== 0 else 1) # transfroms target to be 0 for no heart disease, and 1 for any form of heart disease. 
heart_df = heart_df.drop(labels='target', axis =1)
print(heart_df.isnull().sum())
print(heart_df.shape)

age              0
sex              0
cp               0
trestbps         0
chol             0
fbs              0
restecg          0
thalach          0
exang            0
oldpeak          0
slope            0
ca               0
thal             0
heart_disease    0
dtype: int64
(297, 14)

# Display basic information about the dataset

print("Number of observations:", heart_df.shape[0])
print("Number of features:", heart_df.shape[1] - 1)  # Exclude target column 
print(heart_df.dtypes)

Number of observations: 297
Number of features: 13
age              float64
sex              float64
cp               float64
trestbps         float64
chol             float64
fbs              float64
restecg          float64
thalach          float64
exang            float64
oldpeak          float64
slope            float64
ca               float64
thal             float64
heart_disease      int64
dtype: object

# One-hot encode cp variable
from sklearn.preprocessing import OneHotEncoder

# Instantiate OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Apply OneHotEncoder to the 'cp' column
encoded_features = encoder.fit_transform(heart_df[['cp']])
encoded_feature_names = encoder.get_feature_names_out(['cp'])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)

# Drop the original 'cp' column
heart_df = heart_df.drop(columns=['cp'])

# Concatenate the original DataFrame with the encoded DataFrame
heart_df = pd.concat([heart_df, encoded_df], axis=1)

# Rename the new variables
heart_df = heart_df.rename(columns={"cp_1.0":"cp: typical angina", "cp_2.0":"cp: atypical angina", "cp_3.0":"cp: non-anginal pain", "cp_4.0":"cp: asymptomatic"})

# Display the updated DataFrame
print(heart_df.head())

    age  sex  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0  63.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3    3.0   
1  67.0  1.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5    2.0   
2  67.0  1.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6    2.0   
3  37.0  1.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5    3.0   
4  41.0  0.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4    1.0   

    ca  thal  heart_disease  cp: typical angina  cp: atypical angina  \
0  0.0   6.0              0                 1.0                  0.0   
1  3.0   3.0              1                 0.0                  0.0   
2  2.0   7.0              1                 0.0                  0.0   
3  0.0   3.0              0                 0.0                  0.0   
4  0.0   3.0              0                 0.0                  1.0   

   cp: non-anginal pain  cp: asymptomatic  
0                   0.0               0.0  
1                   0.0               1.0  
2                   0.0               1.0  
3                   1.0               0.0  
4                   0.0               0.0

# OneHotEncoder for 'slope' feature
encoder = OneHotEncoder(sparse_output=False)

# Apply OneHotEncoder to the 'slope' column
encoded_features = encoder.fit_transform(heart_df[['slope']])
encoded_feature_names = encoder.get_feature_names_out(['slope'])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)

# Drop the original 'slope' column
heart_df = heart_df.drop(columns=['slope'])

# Concatenate the original DataFrame with the encoded DataFrame
heart_df = pd.concat([heart_df, encoded_df], axis=1)

# Rename the new variables
heart_df = heart_df.rename(columns={"slope_1.0":"upsloping", "slope_2.0":"flat", "slope_3.0":"downsloping"})

# Display the updated DataFrame
print(heart_df.head())

    age  sex  trestbps   chol  fbs  restecg  thalach  exang  oldpeak   ca  \
0  63.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3  0.0   
1  67.0  1.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5  3.0   
2  67.0  1.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6  2.0   
3  37.0  1.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5  0.0   
4  41.0  0.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4  0.0   

   thal  heart_disease  cp: typical angina  cp: atypical angina  \
0   6.0              0                 1.0                  0.0   
1   3.0              1                 0.0                  0.0   
2   7.0              1                 0.0                  0.0   
3   3.0              0                 0.0                  0.0   
4   3.0              0                 0.0                  1.0   

   cp: non-anginal pain  cp: asymptomatic  upsloping  flat  downsloping  
0                   0.0               0.0        0.0   0.0          1.0  
1                   0.0               1.0        0.0   1.0          0.0  
2                   0.0               1.0        0.0   1.0          0.0  
3                   1.0               0.0        0.0   0.0          1.0  
4                   0.0               0.0        1.0   0.0          0.0

# OneHotEncoder for 'thal' feature
encoder = OneHotEncoder(sparse_output=False)

# Apply OneHotEncoder to the 'thal' column
encoded_features = encoder.fit_transform(heart_df[['thal']])
encoded_feature_names = encoder.get_feature_names_out(['thal'])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names,index=heart_df.index)

# Drop the original 'thal' column
heart_df = heart_df.drop(columns=['thal'])

# Concatenate the original DataFrame with the encoded DataFrame
heart_df = pd.concat([heart_df, encoded_df], axis=1)

# Rename the new variables
heart_df = heart_df.rename(columns={"thal_3.0":"thal: normal", "thal_6.0":"thal: fixed defect", "thal_7.0":"thal: reversable defect"})

# Display the updated DataFrame
print(heart_df.head())

    age  sex  trestbps   chol  fbs  restecg  thalach  exang  oldpeak   ca  \
0  63.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3  0.0   
1  67.0  1.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5  3.0   
2  67.0  1.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6  2.0   
3  37.0  1.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5  0.0   
4  41.0  0.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4  0.0   

   ...  cp: typical angina  cp: atypical angina  cp: non-anginal pain  \
0  ...                 1.0                  0.0                   0.0   
1  ...                 0.0                  0.0                   0.0   
2  ...                 0.0                  0.0                   0.0   
3  ...                 0.0                  0.0                   1.0   
4  ...                 0.0                  1.0                   0.0   

   cp: asymptomatic  upsloping  flat  downsloping  thal: normal  \
0               0.0        0.0   0.0          1.0           0.0   
1               1.0        0.0   1.0          0.0           1.0   
2               1.0        0.0   1.0          0.0           0.0   
3               0.0        0.0   0.0          1.0           1.0   
4               0.0        1.0   0.0          0.0           1.0   

   thal: fixed defect  thal: reversable defect  
0                 1.0                      0.0  
1                 0.0                      0.0  
2                 0.0                      1.0  
3                 0.0                      0.0  
4                 0.0                      0.0  

[5 rows x 21 columns]

# OneHotEncoder for 'restecg' feature
encoder = OneHotEncoder(sparse_output=False)

# Apply OneHotEncoder to the 'restecg' column
encoded_features = encoder.fit_transform(heart_df[['restecg']])
encoded_feature_names = encoder.get_feature_names_out(['restecg'])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names,index=heart_df.index)

# Drop the original 'restecg' column
heart_df = heart_df.drop(columns=['restecg'])

# Concatenate the original DataFrame with the encoded DataFrame
heart_df = pd.concat([heart_df, encoded_df], axis=1)

# Rename the new variables
heart_df = heart_df.rename(columns={"restecg_0.0":"ecg: normal", "restecg_1.0":"ecg: ST-T wave abnormality", "restecg_2.0":"ecg: left ventricular hypertrophy"})

# Display the updated DataFrame
print(heart_df.head())

    age  sex  trestbps   chol  fbs  thalach  exang  oldpeak   ca  \
0  63.0  1.0     145.0  233.0  1.0    150.0    0.0      2.3  0.0   
1  67.0  1.0     160.0  286.0  0.0    108.0    1.0      1.5  3.0   
2  67.0  1.0     120.0  229.0  0.0    129.0    1.0      2.6  2.0   
3  37.0  1.0     130.0  250.0  0.0    187.0    0.0      3.5  0.0   
4  41.0  0.0     130.0  204.0  0.0    172.0    0.0      1.4  0.0   

   heart_disease  ...  cp: asymptomatic  upsloping  flat  downsloping  \
0              0  ...               0.0        0.0   0.0          1.0   
1              1  ...               1.0        0.0   1.0          0.0   
2              1  ...               1.0        0.0   1.0          0.0   
3              0  ...               0.0        0.0   0.0          1.0   
4              0  ...               0.0        1.0   0.0          0.0   

   thal: normal  thal: fixed defect  thal: reversable defect  ecg: normal  \
0           0.0                 1.0                      0.0          0.0   
1           1.0                 0.0                      0.0          0.0   
2           0.0                 0.0                      1.0          0.0   
3           1.0                 0.0                      0.0          1.0   
4           1.0                 0.0                      0.0          0.0   

   ecg: ST-T wave abnormality  ecg: left ventricular hypertrophy  
0                         0.0                                1.0  
1                         0.0                                1.0  
2                         0.0                                1.0  
3                         0.0                                0.0  
4                         0.0                                1.0  

[5 rows x 23 columns]

# Looking for Outliers and investigating the data set.
heart_df.describe()[['age','trestbps','chol','thalach','oldpeak']]

# Checking chol for outliers
import plotly.express as px
fig = px.box(heart_df,y="chol")
fig.show()

# Dropping most extreme value
heart_df = heart_df[heart_df['chol'] != 564]
heart_df = heart_df.reset_index()

# Checking oldpeak for outliers
import plotly.express as px
fig = px.box(heart_df,y="oldpeak")
fig.show()

# Dropping most extreme values
heart_df = heart_df[heart_df['oldpeak'] != 6.2]
heart_df = heart_df[heart_df['oldpeak'] != 5.6]

# Min-max scaling the numerical features
from sklearn.preprocessing import MinMaxScaler

min_max = MinMaxScaler()

heart_df['age'] = min_max.fit_transform(heart_df[['age']])
heart_df['trestbps'] = min_max.fit_transform(heart_df[['trestbps']])
heart_df['chol'] = min_max.fit_transform(heart_df[['chol']])
heart_df['thalach'] = min_max.fit_transform(heart_df[['thalach']])
heart_df['oldpeak'] = min_max.fit_transform(heart_df[['oldpeak']])
heart_df = heart_df.drop("index", axis=1)
heart_df.head()

import matplotlib.pyplot as plt
import seaborn as sns

# Histograms for numerical features
heart_df.hist(bins=20, figsize=(14, 10))
plt.show()

# Feature Selection tool

def auto_feature_selector(dataset_path, methods=[]):
    # uses methods below to find the best features through aggregating the results of the 
    # dataset_path - a pandas dataframe
    # methods - a list of various feature selection methods we outlined before, use them all here (list)

    # preprocessing
    X, y, num_feats = preprocess_dataset(dataset_path)

    # creating lists and setting them to False

    feature_name = list(X.columns)
    cor_support = [False for i in range(0, len(feature_name))]
    chi_support = [False for i in range(0, len(feature_name))]
    rfe_support = [False for i in range(0, len(feature_name))]
    embedded_lr_support = [False for i in range(0, len(feature_name))]
    embedded_rf_support = [False for i in range(0, len(feature_name))]

    # Run every method we outlined above from the methods list and collect returned best features from every method
    if 'pearson' in methods:
        cor_support, cor_feature = cor_selector(X, y, num_feats)
    if 'chi-square' in methods:
        chi_support, chi_feature = chi_squared_selector(X, y, num_feats)
    if 'rfe' in methods:
        rfe_support, rfe_feature = rfe_selector(X, y, num_feats)
    if 'log-reg' in methods:
        embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
    if 'rf' in methods:
        embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)

    # Combine all the above feature list and count the maximum set of features that got selected by all methods

    feature_selection_df = pd.DataFrame(
        {'Feature': feature_name, 'Pearson': cor_support, 'Chi-2': chi_support, 'RFE': rfe_support,
         'Logistics': embedded_lr_support,
         'Random Forest': embedded_rf_support})
    # count the selected times for each feature
    feature_selection_df['Total'] = np.sum(feature_selection_df.drop('Feature', axis=1), axis=1)
    # display the top features
    feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
    feature_selection_df.index = range(1, len(feature_selection_df) + 1)
    best_features = feature_selection_df['Feature'][0:num_feats]
    
    return best_features

def preprocess_dataset(dataset):
    # splitting data set and picking number of features
    X = heart_df.drop(['heart_disease'],axis=1)
    y = heart_df['heart_disease']
    num_feats = 9
    return X, y, num_feats

# Method that finds best features using pearson correlation
def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    
    return cor_support, cor_feature

# Method that finds best features using chi-squared
def chi_squared_selector(X, y, num_feats):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    from sklearn.preprocessing import MinMaxScaler
    
    X_norm = MinMaxScaler().fit_transform(X)
    chi_selector =SelectKBest(chi2,k=num_feats)
    chi_selector.fit(X_norm,y)
    chi_support = chi_selector.get_support()
    chi_feature = X.loc[:,chi_support].columns.tolist()
    
    return chi_support, chi_feature

# Method that finds best features using rfe

def rfe_selector(X, y, num_feats):
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import MinMaxScaler
    
    X_norm= MinMaxScaler().fit_transform(X)
    rfe_selector = RFE(estimator=LogisticRegression(),
                      n_features_to_select= num_feats,
                      step=10,verbose=0)
    rfe_selector.fit(X_norm,y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.loc[:,rfe_support].columns.tolist()
    
    return rfe_support, rfe_feature

# Method that finds best features using logistic regression

def embedded_log_reg_selector(X, y, num_feats):
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import MinMaxScaler
    
    X_norm = MinMaxScaler().fit_transform(X)
    embedded_lr_selector = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=0),
                                           max_features=num_feats)
    embedded_lr_selector.fit(X_norm, y)
    embedded_lr_support = embedded_lr_selector.get_support()
    embedded_lr_feature = X.loc[:, embedded_lr_support].columns.tolist()

    return embedded_lr_support, embedded_lr_feature

# Method that finds best features using Random Forest

def embedded_rf_selector(X, y, num_feats):
    from sklearn.feature_selection import SelectFromModel
    from sklearn.ensemble import RandomForestClassifier
    
    embedded_rf_select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0),
                                         max_features=num_feats)
    embedded_rf_select.fit(X, y)

    embedded_rf_support = embedded_rf_select.get_support()
    embedded_rf_feature = X.loc[:, embedded_rf_support].columns.tolist()

    return embedded_rf_support, embedded_rf_feature



methods = ['pearson', 'chi-square', 'rfe', 'log-reg', 'rf']
best_features = auto_feature_selector(dataset_path=heart_df,methods=methods)
print("According to your selected methods, these are the best features: \n")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(best_features)

According to your selected methods, these are the best features: 

1           cp: asymptomatic
2                         ca
3                    thalach
4    thal: reversable defect
5               thal: normal
6                    oldpeak
7                      exang
8                  upsloping
9                   trestbps
Name: Feature, dtype: object

# Create test and train datasets containing selected features
X = heart_df[['cp: asymptomatic','ca','thalach','thal: reversable defect','thal: normal','oldpeak','exang','upsloping','trestbps']]
y = heart_df['heart_disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Show the first few rows of the dataset
print(X_train.head()) 
heart_df.shape
heart_df.to_csv("heart_disease.csv",sep=",",index=False, encoding='utf-8')

# Importing necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
import seaborn as sns


# Creating the model
logmodel = LogisticRegression(random_state=0,max_iter=10000).fit(X_train,y_train)

# Generating the predictions and creating the confusion matrix with the accuracy score
log_predictions = logmodel.predict(X_test)
train_predictions = logmodel.predict(X_train)
print("Recall (test set):",recall_score(y_test,log_predictions), 
      "\nAccuracy (train set):", accuracy_score(y_train,train_predictions),
      "\nAccuracy (test set):", accuracy_score(y_test,log_predictions),
      "\nPercision (test set):", precision_score(y_test,log_predictions),
      "\nF1 score (test set):", f1_score(y_test,log_predictions),
      "\nConfusion Matrix (test set):\n",confusion_matrix(y_test,log_predictions,labels=[1,0]).transpose())

# Visualize Confusion Matrix
cm = confusion_matrix(y_test,log_predictions,labels=[1,0]).transpose()

# Create DataFrame for confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Baseline Logistic Regression Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Creating a dictionary containing all hyperparameter values to consider.
grid_values = {'C': [0.001,0.01,0.1,1,10,100,1000],'tol':[0.0001,0.001,0.01,0.1,1,10]}
grid = GridSearchCV(logmodel,grid_values,refit=True,verbose=0,scoring='accuracy')  # refits the estimator using the best found parameters
grid.fit(X_train,y_train)

# Getting the predicted values, as well as printing the accuracy and confusion matrix.
grid_predictions = grid.predict(X_test)
train_predictions = grid.predict(X_train)
print("Recall (test set):",recall_score(y_test,grid_predictions),
      "\nAccuracy (train set):", accuracy_score(y_train,train_predictions),
      "\nAccuracy (test set):", accuracy_score(y_test,grid_predictions),
      "\nPercision (test set):", precision_score(y_test,grid_predictions),
      "\nF1 score (test set):", f1_score(y_test,grid_predictions),
      "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
      "\nBest Parameters:", grid.best_params_)

# Note the recall of this model is worse than the orginal. This is acceptable because it was created using cross validation. 
# This means that though the data performs worse on the test set, it will likely perform better on average on new data points.

# Visualize Confusion Matrix
cm = confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose()

# Create DataFrame for confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Tuned Logistic Regression Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Dropping least important variables and fitting a new model
dropped_X_train = X_train.drop(['trestbps','upsloping','exang'],axis=1)
dropped_X_test = X_test.drop(['trestbps','upsloping','exang'],axis=1)
grid.fit(dropped_X_train,y_train)

# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(dropped_X_test)
train_predictions = grid.predict(dropped_X_train)

                              
print("Recall (test set):",recall_score(y_test,grid_predictions), 
      "\nAccuracy (train set):", accuracy_score(y_train,train_predictions),
      "\nAccuracy (test set):", accuracy_score(y_test,grid_predictions),
      "\nPercision (test set):", precision_score(y_test,grid_predictions),
      "\nF1 score (test set):", f1_score(y_test,grid_predictions),
      "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
      "\nBest Parameters:", grid.best_params_)

Recall (test set): 0.8292682926829268 
Accuracy (train set): 0.8341463414634146 
Accuracy (test set): 0.8539325842696629 
Percision (test set): 0.85 
F1 score (test set): 0.8395061728395061 
Confusion Matrix:
 [[34  6]
 [ 7 42]] 
Best Parameters: {'C': 1, 'tol': 1}

# Visualize Confusion Matrix
cm = confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose()

# Create DataFrame for confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Tuned Logistic Regression Model with Dropped Features')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Importing necessary libraries
from sklearn.tree import DecisionTreeClassifier

# Creating the decision tree and fitting it with default values
DT_clf = DecisionTreeClassifier(random_state=0)
DT_clf.fit(X_train,y_train)
DT_test_predictions = DT_clf.predict(X_test)
DT_train_predictions = DT_clf.predict(X_train)

# Print evaluation metrics
print("Recall (test set):", recall_score(y_test, DT_test_predictions))
print("Accuracy (train set):", accuracy_score(y_train, DT_train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, DT_test_predictions))
print("Precision (test set):", precision_score(y_test, DT_test_predictions))
print("F1 score (test set):", f1_score(y_test, DT_test_predictions))
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, DT_test_predictions, labels=[1, 0]).transpose())

Recall (test set): 0.7317073170731707
Accuracy (train set): 1.0
Accuracy (test set): 0.7078651685393258
Precision (test set): 0.6666666666666666
F1 score (test set): 0.6976744186046511
Confusion Matrix (test set):
 [[30 15]
 [11 33]]

# Visualize Confusion Matrix
cm_DT = confusion_matrix(y_test,log_predictions,labels=[1,0]).transpose()

# Create DataFrame for confusion matrix
cm_df_DT = pd.DataFrame(cm_DT, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df_DT, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Decision Tree Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Creating a dictionary containing all hyperparameter values to consider
grid_values = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV to find the best hyperparameters
grid = GridSearchCV(DT_clf, grid_values, refit=True, verbose=0, scoring='accuracy')
grid.fit(X_train, y_train)

# Getting the predicted values, as well as printing the accuracy and confusion matrix
grid_predictions = grid.predict(X_test)
train_predictions = grid.predict(X_train)

print("Recall (test set):", recall_score(y_test, grid_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, grid_predictions))
print("Precision (test set):", precision_score(y_test, grid_predictions))
print("F1 score (test set):", f1_score(y_test, grid_predictions))
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose())
print("Best Parameters:", grid.best_params_) 
# Note: The recall of this model might be worse than the original.
# This is acceptable because it was created using cross-validation.
# This means that though the data performs worse on the test set, it will likely perform better on average on new data points

Recall (test set): 0.7073170731707317
Accuracy (train set): 0.9170731707317074
Accuracy (test set): 0.7640449438202247
Precision (test set): 0.7631578947368421
F1 score (test set): 0.7341772151898733
Confusion Matrix (test set):
 [[29  9]
 [12 39]]
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}

# Visualize Confusion Matrix
cm_DT = confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose()

# Create DataFrame for confusion matrix
cm_df_DT = pd.DataFrame(cm_DT, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df_DT, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Tuned Decision Tree Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Dropping least important variables and fitting a new model
dropped_X_train = X_train.drop(['trestbps', 'upsloping', 'exang'], axis=1)
dropped_X_test = X_test.drop(['trestbps', 'upsloping', 'exang'], axis=1)
grid.fit(dropped_X_train, y_train)

# Getting the predicted values, as well as printing the accuracy and confusion matrix
grid_predictions = grid.predict(dropped_X_test)
train_predictions = grid.predict(dropped_X_train)

print("Recall (test set):", recall_score(y_test, grid_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, grid_predictions))
print("Precision (test set):", precision_score(y_test, grid_predictions))
print("F1 score (test set):", f1_score(y_test, grid_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose())
print("Best Parameters:", grid.best_params_)

Recall (test set): 0.6829268292682927
Accuracy (train set): 0.8926829268292683
Accuracy (test set): 0.7528089887640449
Precision (test set): 0.7567567567567568
F1 score (test set): 0.7179487179487181
Confusion Matrix:
 [[28  9]
 [13 39]]
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}

# Confusion Matrix
cm = confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose()

# Create DataFrame for confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for Tuned Decision Tree Model with Dropped Features')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Importing Necessary Libraries
from sklearn.ensemble import RandomForestClassifier

# Creating the random forest and fitting it to the data.
RF_clf = RandomForestClassifier(random_state = 1)
RF_clf.fit(X_train,y_train)
RF_predictions = RF_clf.predict(X_test)

#Creating the confusion matrix and reporting recall
print("Recall score:",recall_score(y_test,RF_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,RF_predictions,labels=[1,0]).transpose())

Recall score: 0.6666666666666666 
Confusion Matrix:
 [[28  7]
 [14 41]]

# Creating a dictionary containing all hyperparameter values to consider.
grid_values = {
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Creating the grid object itself, and fitting it to the data.
grid = GridSearchCV(RF_clf,grid_values,refit=True,verbose=0,scoring='recall')  # finds optimal hyperparameter values (optimizing recall)
grid.fit(X_train,y_train)

# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(X_test)
print("Recall score:",recall_score(y_test,grid_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
     "\nBest Parameters:", grid.best_params_)

# Performs the same as the orginal model, but since it was created through cross validation, it will be more robust to new data.

Recall score: 0.7142857142857143 
Confusion Matrix:
 [[30  4]
 [12 44]] 
Best Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}

# Dropping cholesterol level and fitting a new model
dropped_X_train = X_train.drop(['chol','cp','age'],axis=1)
dropped_X_test = X_test.drop(['chol','cp','age'],axis=1)
grid.fit(dropped_X_train,y_train)

# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(dropped_X_test)
print("Recall score:",recall_score(y_test,grid_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
     "\nBest Parameters:", grid.best_params_)

Recall score: 0.7380952380952381 
Confusion Matrix:
 [[31  7]
 [11 41]] 
Best Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}

# Importing Necessary Libraries
from sklearn.linear_model import SGDClassifier

# Creating the SGD model
sgd_model = SGDClassifier(random_state=0, max_iter=10000).fit(X_train, y_train)

# Generating the predictions and creating the confusion matrix with the accuracy score
sgd_predictions = sgd_model.predict(X_test)
train_predictions = sgd_model.predict(X_train)

print("Recall (test set):", recall_score(y_test, sgd_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, sgd_predictions))
print("Precision (test set):", precision_score(y_test, sgd_predictions))
print("F1 score (test set):", f1_score(y_test, sgd_predictions))
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, sgd_predictions, labels=[1, 0]).transpose())

Recall (test set): 0.8292682926829268
Accuracy (train set): 0.8536585365853658
Accuracy (test set): 0.8764044943820225
Precision (test set): 0.8947368421052632
F1 score (test set): 0.860759493670886
Confusion Matrix (test set):
 [[34  4]
 [ 7 44]]

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, sgd_predictions, labels=[1, 0]).transpose()
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for SGD Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Creating a dictionary containing all hyperparameter values to consider
grid_values = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'max_iter': [1000, 5000, 10000]
}

# Perform GridSearchCV to find the best hyperparameters
grid = GridSearchCV(sgd_model, grid_values, refit=True, verbose=0, scoring='accuracy')
grid.fit(X_train, y_train)

# Getting the predicted values, as well as printing the accuracy and confusion matrix
grid_predictions = grid.predict(X_test)
train_predictions = grid.predict(X_train)

print("Recall (test set):", recall_score(y_test, grid_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, grid_predictions))
print("Precision (test set):", precision_score(y_test, grid_predictions))
print("F1 score (test set):", f1_score(y_test, grid_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose())
print("Best Parameters:", grid.best_params_)

Recall (test set): 0.8780487804878049
Accuracy (train set): 0.8146341463414634
Accuracy (test set): 0.8876404494382022
Precision (test set): 0.8780487804878049
F1 score (test set): 0.8780487804878049
Confusion Matrix:
 [[36  5]
 [ 5 43]]
Best Parameters: {'alpha': 0.001, 'max_iter': 1000, 'penalty': 'l2'}

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose()
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for SGD Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Perform GridSearchCV to find the best hyperparameters
grid = GridSearchCV(sgd_model, grid_values, refit=True, verbose=0, scoring='accuracy')
grid.fit(X_train, y_train)

# Dropping least important variables and fitting a new model
dropped_X_train = X_train.drop(['trestbps', 'upsloping', 'exang'], axis=1)
dropped_X_test = X_test.drop(['trestbps', 'upsloping', 'exang'], axis=1)
grid.fit(dropped_X_train, y_train)

# Getting the predicted values, as well as printing the accuracy and confusion matrix
grid_predictions = grid.predict(dropped_X_test)
train_predictions = grid.predict(dropped_X_train)

print("Recall (test set):", recall_score(y_test, grid_predictions))
print("Accuracy (train set):", accuracy_score(y_train, train_predictions))
print("Accuracy (test set):", accuracy_score(y_test, grid_predictions))
print("Precision (test set):", precision_score(y_test, grid_predictions))
print("F1 score (test set):", f1_score(y_test, grid_predictions))
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose())
print("Best Parameters:", grid.best_params_)

Recall (test set): 0.7560975609756098
Accuracy (train set): 0.8146341463414634
Accuracy (test set): 0.8314606741573034
Precision (test set): 0.8611111111111112
F1 score (test set): 0.8051948051948052
Confusion Matrix (test set):
 [[31  5]
 [10 43]]
Best Parameters: {'alpha': 0.01, 'max_iter': 1000, 'penalty': 'l2'}

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, grid_predictions, labels=[1, 0]).transpose()
cm_df = pd.DataFrame(cm, index=['Actual 1', 'Actual 0'], columns=['Predicted 1', 'Predicted 0'])

plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='d', annot_kws={"size": 24})
plt.title('Confusion Matrix for SGD Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Importing Necessary Libraries
from sklearn.svm import SVC

# Creating the random forest and fitting it to the data.
SVM_model = SVC(random_state=1)
SVM_model.fit(X_train,y_train)
SVM_predictions = SGD_model.predict(X_test)

#Creating the confusion matrix and reporting recall
print("Recall score:",recall_score(y_test,SVM_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,SVM_predictions,labels=[1,0]).transpose())

Recall score: 0.6904761904761905 
Confusion Matrix:
 [[29 11]
 [13 37]]

# Creating a dictionary containing all hyperparameter values to consider.
grid_values = {'C': [0.1,1,10,100,1000], 'gamma': [1,0.1,0.001,0.0001],'kernel':['rbf']}

# Creating the grid object itself, and fitting it to the data.
grid = GridSearchCV(SVM_model,grid_values,refit=True,verbose=0,scoring='recall')  # finds optimal hyperparameter values (optimizing recall)
grid.fit(X_train,y_train)

# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(X_test)
print("Recall score:",recall_score(y_test,grid_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
     "\nBest Parameters:", grid.best_params_)

# Performs the same as the orginal model, but since it was created through cross validation, it will be more robust to new data.

Recall score: 0.7142857142857143 
Confusion Matrix:
 [[30  5]
 [12 43]] 
Best Parameters: {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}

# Dropping cholesterol level and fitting a new model
dropped_X_train = X_train.drop(['chol','cp','age'],axis=1)
dropped_X_test = X_test.drop(['chol','cp','age'],axis=1)
grid.fit(dropped_X_train,y_train)

# Fitting with best parameters, printing confusion matrix and recall
grid_predictions = grid.predict(dropped_X_test)
print("Recall score:",recall_score(y_test,grid_predictions), "\nConfusion Matrix:\n",confusion_matrix(y_test,grid_predictions,labels=[1,0]).transpose(),
     "\nBest Parameters:", grid.best_params_)

Recall score: 0.7380952380952381 
Confusion Matrix:
 [[31  6]
 [11 42]] 
Best Parameters: {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}

	age	trestbps	chol	thalach	oldpeak
count	297.000000	297.000000	297.000000	297.000000	297.000000
mean	54.542088	131.693603	247.350168	149.599327	1.055556
std	9.049736	17.762806	51.997583	22.941562	1.166123
min	29.000000	94.000000	126.000000	71.000000	0.000000
25%	48.000000	120.000000	211.000000	133.000000	0.000000
50%	56.000000	130.000000	243.000000	153.000000	0.800000
75%	61.000000	140.000000	276.000000	166.000000	1.600000
max	77.000000	200.000000	564.000000	202.000000	6.200000

Predicting Heart Disease Using Fundemental Machine Learning Algorithms¶

Data Loading and Cleaning¶

Logistic Regression¶

Creating Basic Model¶

Tuning Hyperparameters¶

Dropping Features and Revaluating¶

Decision Tree¶

Creating Basic Model¶

Tuning Hyperparameters¶

Dropping Features and Revaluating¶

Random Forest¶

Creating Basic Model¶

Tuning Hyperparameters¶

Dropping Features and Revaluating¶

Stochastic Gradient Descent¶

Creating Basic Model¶

Tuning Hyperparameters¶

Dropping Features and Revaluating¶

Support Vector Machine¶

Creating Basic Model¶

Tuning Hyperparameters¶

Dropping Features and Revaluating¶

	age	sex	trestbps	chol	fbs	thalach	exang	oldpeak	ca	heart_disease	...	cp: asymptomatic	upsloping	flat	downsloping	thal: normal	thal: fixed defect	thal: reversable defect	ecg: normal	ecg: left ventricular hypertrophy
0	0.708333	1.0	0.481132	0.367698	1.0	0.603053	0.0	0.522727	0.0	0	...	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	1.0
1	0.791667	1.0	0.622642	0.549828	0.0	0.282443	1.0	0.340909	3.0	1	...	1.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	1.0
2	0.791667	1.0	0.245283	0.353952	0.0	0.442748	1.0	0.590909	2.0	1	...	1.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0	1.0
3	0.166667	1.0	0.339623	0.426117	0.0	0.885496	0.0	0.795455	0.0	0	...	0.0	0.0	0.0	1.0	1.0	0.0	0.0	1.0	0.0
4	0.250000	0.0	0.339623	0.268041	0.0	0.770992	0.0	0.318182	0.0	0	...	0.0	1.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0