Name of QuantLet: SDA_2021_HeartFailure
Published in: SDA_2021_St_Gallen
Description: 'This project aims to predict the death of patients suffering from heart disease. This could make it possible to adapt treatments and, in some cases, prevent heart failure.'
Keywords: 'heart failure, death prediction, EDA, logistic regression, decision tree classification, random forest feature selection, test accuracy'
Authors: 'Ozokcu Arzu, Therry Leonore'
Submitted: '07.12.2021'
Additional Info: 'This repository looks at the features that best explain the death of a patient suffering from heart disease. First, an exploratory data analysis presents the distribution of the features and their correlations. Then, several models are implemented: logistic regression, decision tree classification, and random forest feature selection, followed by a test accuracy comparison.'
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# imports
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff

from sklearn import preprocessing, tree
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, accuracy_score as acc_rate,
                             classification_report, roc_curve,
                             precision_recall_curve, auc, confusion_matrix)
# adjust this path to the local copy of the heart failure dataset
data = pd.read_csv("/Users/leonoretherry/Documents/St Gallen M2/smart data analytics/heart_failure_clinical_records_dataset.csv")
x = data.loc[:, :"time"]          # features: every column up to and including 'time'
y = data.loc[:, ["DEATH_EVENT"]]  # target: death event indicator
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 2)
# In[2]:
#let's have an overview of our data set
data.head()
# In[3]:
# number and percentage of participants in the study who died
data['DEATH_EVENT'].value_counts()
data['DEATH_EVENT'].value_counts(normalize=True) * 100
# about 67% of the patients followed during the study survived, while about 32% died
# In[4]:
data.isnull().sum()
#there are no missing values in our data set
# In[5]:
#overview of our explicative variables
hist = data.hist(figsize=(10,9))
plt.savefig("pandas_hist_01.png", bbox_inches='tight', dpi=100)
# In[6]:
# Drop the 'time' column: it measures how long each patient was followed by the
# study rather than a clinical characteristic, so it is a poor explanatory
# variable for the death event
data2 = data.drop(['time'], axis=1)
x2 = data2.loc[:, :"smoking"]   # all features except 'time'
y2 = data2.loc[:, ["DEATH_EVENT"]]
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.33, random_state=2)
plt.figure(figsize=(15,8))
sns.heatmap(data2.corr(), annot=True)
# time, ejection_fraction and serum_creatinine are the variables most strongly
# correlated with the death event
# a lower ejection fraction and a higher serum creatinine level both increase
# the probability of death
# time is negatively correlated with death: the shorter the follow-up period,
# the fewer deaths could be observed; a patient may still have died shortly
# after the end of the study
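# a minimal sketch (illustrative addition) ranking features by absolute
# correlation with the death event, to make the statements above checkable:
corr_with_death = data.corr()['DEATH_EVENT'].drop('DEATH_EVENT')
print(corr_with_death.abs().sort_values(ascending=False))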
# In[7]:
# logistic regression on standardized training features
scaler = preprocessing.StandardScaler().fit(x2_train)
x2_train_scaled = scaler.transform(x2_train)
model = LogisticRegression()
model.fit(x2_train_scaled, y2_train)

# statsmodels logit (no intercept term) on the full sample to inspect the coefficients
import statsmodels.api as sm
logit_model = sm.Logit(y2, x2)
result = logit_model.fit()
print(result.summary())
# ejection_fraction and serum_creatinine are once again the variables with the
# greatest impact ('time' is already excluded from x2)
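# a minimal sketch (illustrative addition) evaluating the fitted scikit-learn
# model on the held-out test set, scaled with the scaler fitted on the training data:
x2_test_scaled = scaler.transform(x2_test)
print('Logistic regression test accuracy:',
      acc_rate(y2_test, model.predict(x2_test_scaled)))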
# In[8]:
# decision tree: we keep max_depth=2 as the test accuracy was higher than with
# a depth of 3 or 4 (a small check is sketched at the end of this cell)
clf2 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=2)
clf2.fit(X=x2_train, y=y2_train)

# make predictions on the test set
print('\nThe target test data set is:\n', y2_test)
print('\nThe predicted result is:\n', clf2.predict(x2_test))
print('\nAccuracy rate is:\n', acc_rate(y2_test, clf2.predict(x2_test)))

# plot the fitted tree
plt.figure(figsize=(25, 10))
c = plot_tree(clf2,
              feature_names=x2.columns,
              class_names=['0', '1'],
              filled=True,
              rounded=True,
              fontsize=14)
plt.savefig("decisiontreeOptimizedwithouttime.png")
# In[9]:
y2_pred = clf2.predict(x2_test)
print(confusion_matrix(y2_test, y2_pred))
target_names = ['class 0', 'class 1']
print(classification_report(y2_test, y2_pred, target_names=target_names))
# the model predicts survival (class 0) better than death (class 1)
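# one possible remedy for the weaker recall on class 1 is to reweight the
# classes; a minimal, untuned sketch (illustrative addition) with
# class_weight='balanced':
clf_bal = tree.DecisionTreeClassifier(criterion='entropy', max_depth=2,
                                      class_weight='balanced')
clf_bal.fit(x2_train, y2_train)
print(classification_report(y2_test, clf_bal.predict(x2_test),
                            target_names=target_names))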
# In[10]:
# Distribution of death events by gender (sex = 1 for men, 0 for women)
len_data = len(data)
len_w = len(data[data["sex"] == 0])
len_m = len_data - len_w
men_died = len(data.loc[(data["DEATH_EVENT"] == 1) & (data['sex'] == 1)])
men_survived = len_m - men_died
women_died = len(data.loc[(data["DEATH_EVENT"] == 1) & (data['sex'] == 0)])
women_survived = len_w - women_died
labels = ['Men died', 'Men survived', 'Women died', 'Women survived']
values = [men_died, men_survived, women_died, women_survived]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+percent', hole=0.4)])
fig.update_layout(title_text="Distribution of DEATH_EVENT by gender")
fig.show()
# In[11]:
# Age distribution by death event
fg = sns.FacetGrid(data, hue="DEATH_EVENT", height=6)
fg.map(sns.kdeplot, "age", shade=True).add_legend(labels=["Alive", "Not alive"])
plt.title('Age Distribution Plot')
plt.show()
# In[12]:
# Death event by diabetes status
pd.crosstab(data.diabetes, data.DEATH_EVENT).plot(kind='bar')
plt.legend(title='DEATH_EVENT', loc='upper right', labels=['No death event', 'Death event'])
plt.title('Death event by diabetes status')
plt.xlabel('Diabetes')
plt.ylabel('# of patients')
plt.show()
# In[13]:
# Death event by high blood pressure
pd.crosstab(data.high_blood_pressure, data.DEATH_EVENT).plot(kind='bar')
plt.legend(title='DEATH_EVENT', loc='upper right', labels=['No death event', 'Death event'])
plt.title('Death event by high blood pressure')
plt.xlabel('High blood pressure')
plt.ylabel('# of patients')
plt.show()
# In[14]:
# Death event by smoking status
pd.crosstab(data.smoking, data.DEATH_EVENT).plot(kind='bar')
plt.legend(title='DEATH_EVENT', loc='upper right', labels=['No death event', 'Death event'])
plt.title('Death event by smoking status')
plt.xlabel('Smokers')
plt.ylabel('# of patients')
plt.show()
# In[15]:
# Distribution of diabetes by gender (sex = 1 for men, 0 for women)
len_data = len(data)
len_w = len(data[data["sex"] == 0])
len_m = len_data - len_w
men_with_diabetes = len(data.loc[(data["diabetes"] == 1) & (data['sex'] == 1)])
men_without_diabetes = len_m - men_with_diabetes
women_with_diabetes = len(data.loc[(data["diabetes"] == 1) & (data['sex'] == 0)])
women_without_diabetes = len_w - women_with_diabetes
labels = ['M_diabetes', 'M_no_diabetes', 'W_diabetes', 'W_no_diabetes']
values = [men_with_diabetes, men_without_diabetes, women_with_diabetes, women_without_diabetes]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+percent', hole=0.4)])
fig.update_layout(
    title_text="Distribution of diabetics/non-diabetics by gender (M for men, W for women)")
fig.show()
# In[16]:
# Feature selection: rank the features by importance with an ExtraTreesClassifier
x = data.copy()
y = x.loc[:, ["DEATH_EVENT"]]
x = x.drop(columns=['time', 'DEATH_EVENT'])
features_names = x.columns
from sklearn.ensemble import ExtraTreesClassifier
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(x, y.values.ravel())
importances = forest.feature_importances_
# standard deviation of the importances across the individual trees
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# print the feature ranking
print("Feature ranking:")
for f in range(x.shape[1]):
    print("%d. %s (%f)" % (f + 1, features_names[indices[f]], importances[indices[f]]))
# In[17]:
plt.figure()
plt.title("Feature importances")
sns.barplot(x=features_names[indices].to_numpy(), y=importances[indices], palette="deep",yerr=std[indices])
plt.xticks(range(x.shape[1]), features_names[indices].to_numpy(),rotation=80)
plt.xlim([-1, x.shape[1]])
plt.show()
# In[18]:
def plot_cm(cm, title):
    z = cm
    x = ['No death event', 'Death event']
    y = x
    # convert each element of z to a string for the annotations
    z_text = [[str(y) for y in x] for x in z]
    # set up the figure
    fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='deep')
    # add a title
    fig.update_layout(title_text='<i><b>Confusion matrix {}</b></i>'.format(title))
    # add a custom x-axis title
    fig.add_annotation(dict(font=dict(color="black", size=14),
                            x=0.5, y=-0.10, showarrow=False,
                            text="Predicted value",
                            xref="paper", yref="paper"))
    # add a custom y-axis title
    fig.add_annotation(dict(font=dict(color="black", size=14),
                            x=-0.15, y=0.5, showarrow=False,
                            text="Real value", textangle=-90,
                            xref="paper", yref="paper"))
    # adjust margins to make room for the y-axis title
    fig.update_layout(margin=dict(t=50, l=20), width=750, height=750)
    # show the colorbar
    fig['data'][0]['showscale'] = True
    fig.show()
# In[19]:
# Compare the test accuracy of several models
# (note: the features here include the 'time' column again)
models = [['Logistic Regression ', LogisticRegression()],
          ['KNearest Neighbor ', KNeighborsClassifier()],
          ['Decision Tree Classifier ', DecisionTreeClassifier()],
          ['SVM ', SVC()]]
x = data.loc[:, :"time"]
y = data.loc[:, ["DEATH_EVENT"]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=2)
models_score = []
for (name, model) in models:
    model.fit(x_train, y_train)
    model_pred = model.predict(x_test)
    cm_model = confusion_matrix(y_test, model_pred)
    models_score.append(accuracy_score(y_test, model_pred))
    print(name)
    print('Validation Accuracy: ', accuracy_score(y_test, model_pred))
    print('Training Accuracy: ', accuracy_score(y_train, model.predict(x_train)))
    print('############################################')
    plot_cm(cm_model, title=name + "model")
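# a minimal sketch (illustrative addition) using the StratifiedKFold and
# cross_val_score imports from above for a more robust accuracy estimate than
# a single train/test split:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
for name, model in models:
    scores = cross_val_score(model, x, y.values.ravel(), cv=skf, scoring='accuracy')
    print(name, 'mean CV accuracy: %.3f (+/- %.3f)' % (scores.mean(), scores.std()))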
automatically created on 2023-02-18