#Set the working directory
import os
os.chdir("C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/3.DSB - Data Science for Bioinformatics/Practice/W12 - Supervised ML in Python")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(color_codes=True)
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
The dataset has 14 columns; the 13 predictor features are described below. In the CSV used here they appear under the short column names age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca and thal, plus the target column.

- slope_of_peak_exercise_st_segment (type: int): the slope of the peak exercise ST segment, an electrocardiography readout indicating the quality of blood flow to the heart
- thal (type: categorical): result of the thallium stress test measuring blood flow to the heart, with possible values normal, fixed_defect, reversible_defect
- resting_blood_pressure (type: int): resting blood pressure
- chest_pain_type (type: int): chest pain type (4 values)
- num_major_vessels (type: int): number of major vessels (0-3) colored by fluoroscopy
- fasting_blood_sugar_gt_120_mg_per_dl (type: binary): fasting blood sugar > 120 mg/dl
- resting_ekg_results (type: int): resting electrocardiographic results (values 0, 1, 2)
- serum_cholesterol_mg_per_dl (type: int): serum cholesterol in mg/dl
- oldpeak_eq_st_depression (type: float): oldpeak = ST depression induced by exercise relative to rest, a measure of abnormality in electrocardiograms
- sex (type: binary): 0: female, 1: male
- age (type: int): age in years
- max_heart_rate_achieved (type: int): maximum heart rate achieved (beats per minute)
- exercise_induced_angina (type: binary): exercise-induced chest pain (0: False, 1: True)
df = pd.read_csv("C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/3.DSB - Data Science for Bioinformatics/Practice/DSB W12 - Supervised ML in Python/heart.csv")
df.head()
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
df.tail()
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
df
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
303 rows × 14 columns
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows :: ", duplicate_rows.shape)
Number of duplicate rows :: (1, 14)
#we have one duplicate row.
#Removing the duplicate row
df = df.drop_duplicates()
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows :: ", duplicate_rows.shape)
#Number of duplicate rows after dropping one duplicate row
Number of duplicate rows :: (0, 14)
There are now no duplicate rows in the data. Next, we check for null values.
df.isnull().sum()
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
There are no null values in this dataset
print(df.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       302 non-null    int64
 1   sex       302 non-null    int64
 2   cp        302 non-null    int64
 3   trestbps  302 non-null    int64
 4   chol      302 non-null    int64
 5   fbs       302 non-null    int64
 6   restecg   302 non-null    int64
 7   thalach   302 non-null    int64
 8   exang     302 non-null    int64
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64
 11  ca        302 non-null    int64
 12  thal      302 non-null    int64
 13  target    302 non-null    int64
dtypes: float64(1), int64(13)
memory usage: 35.4 KB
None
No null values in the dataset.
The data type is numeric for every feature, so we do not have to create dummy variables or apply one-hot encoding before fitting any algorithm.
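If the categorical columns (such as cp, thal or slope) had been stored as strings instead, they would need encoding first. A minimal, purely illustrative sketch using pandas get_dummies (not required for this CSV and not used below):

#Hypothetical: one-hot encode categorical columns if they were stored as text.
#Not needed here because cp, thal and slope are already integer-coded.
categorical_cols = ['cp', 'thal', 'slope']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print(df_encoded.columns.tolist())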
sns.boxplot(x=df['age'])
sns.boxplot(x=df['sex'])
sns.boxplot(x=df['cp'])
sns.boxplot(x=df['trestbps']) #Some outliers are observed in 'trestbps'; they will be removed later
sns.boxplot(x=df['chol']) #Some outliers are observed in 'chol'; they will be removed later
sns.boxplot(x=df['fbs'])
sns.boxplot(x=df['restecg'])
sns.boxplot(x=df['thalach']) #Outliers are present in 'thalach'
sns.boxplot(x=df['exang'])
sns.boxplot(x=df['oldpeak']) #Outliers are present in 'oldpeak'
sns.boxplot(x=df['slope'])
sns.boxplot(x=df['ca']) #Outliers are present in 'ca'
sns.boxplot(x=df['thal'])
Outliers can be removed using two methods:
Inter-Quartile Range (IQR)
Z-score
The IQR method is applied below; a Z-score sketch is shown after the IQR result.
#Find the InterQuartile Range
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3-Q1
print('*********** InterQuartile Range ***********')
print(IQR)
# Remove the outliers using IQR
df1 = df[~((df<(Q1-1.5*IQR))|(df>(Q3+1.5*IQR))).any(axis=1)]
df1.shape
*********** InterQuartile Range ***********
age         13.00
sex          1.00
cp           2.00
trestbps    20.00
chol        63.75
fbs          0.00
restecg      1.00
thalach     32.75
exang        1.00
oldpeak      1.60
slope        1.00
ca           1.00
thal         1.00
target       1.00
dtype: float64
(228, 14)
After removing outliers using the IQR method, the data contains 228 records.
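For comparison, the Z-score method mentioned above could be applied as follows. This is a sketch only (the threshold of 3 standard deviations is a common but arbitrary choice); the IQR-filtered df1 is what the rest of the analysis uses.

#Alternative outlier removal: keep rows where every feature is within 3
#standard deviations of its mean (|z| < 3).
z_scores = np.abs(stats.zscore(df))
df_z = df[(z_scores < 3).all(axis=1)]
print(df_z.shape)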
plt.figure(figsize=(20,10))
sns.heatmap(df.corr(), annot=True, cmap='terrain')
The target is positively correlated with cp, thalach and slope, and negatively correlated with sex, exang, ca, thal and oldpeak.
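To read these relationships off directly rather than from the heatmap, the correlation of each feature with the target can be printed and sorted (a small addition, not part of the original notebook):

#Correlation of every feature with the target, sorted from most positive to
#most negative, to back up the observation above.
print(df.corr()['target'].drop('target').sort_values(ascending=False))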
sns.pairplot(data=df1)
sns.pairplot(data=df1, kind="hist")
We use pair plots to visualise the relationships between the different features and to spot any linear relationships between them.
# Histograms show the shape of each feature's distribution and the count of observations in each bin.
df1.hist(figsize=(12,12), layout=(5,3));
df1.plot(kind='box', subplots=True, layout=(5,3), figsize=(12,12))
plt.show()
Box-and-whisker plots are useful for spotting outliers in the data. If many outliers remain, they should be removed or fixed, otherwise they act as noise in the training data.
sns.catplot(data=df1, x='sex', y='age', hue='target', palette='husl')
sns.barplot(data=df1, x='sex', y='chol', hue='target', palette='spring')
df1['sex'].value_counts()
1    154
0     74
Name: sex, dtype: int64
There are 154 males and 74 females in our data
df1['cp'].value_counts()
0    108
2     61
1     42
3     17
Name: cp, dtype: int64
There are 4 chest pain types, ranging from 0 to 3.
sns.countplot(x='cp', hue='target', data=df, palette='rocket')
gen = pd.crosstab(df1['sex'], df1['target'])
print(gen)
target   0   1
sex
0       12  62
1       84  70
df1.head()
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
5 | 57 | 1 | 0 | 140 | 192 | 0 | 1 | 148 | 0 | 0.4 | 1 | 0 | 1 | 1 |
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
<ipython-input-25-9c6c969fdeb6>:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:1736: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
Note that the scaling above was applied to df; df1 (the outlier-filtered frame used for modelling below) still holds the unscaled values:
df1.head()
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
5 | 57 | 1 | 0 | 140 | 192 | 0 | 1 | 148 | 0 | 0.4 | 1 | 0 | 1 | 1 |
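To scale df1 itself without triggering the SettingWithCopyWarning, an explicit copy can be taken first. A sketch only; df1_scaled is a new name introduced here and is not used by the models below.

#Work on an explicit copy so the assignment does not write into a view of df,
#which is what raises the SettingWithCopyWarning above.
df1_scaled = df1.copy()
scaler1 = StandardScaler()
df1_scaled[columns_to_scale] = scaler1.fit_transform(df1_scaled[columns_to_scale])
df1_scaled.head()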
#Divide the dataset into training and test data: 70% of the records are used for training and the remaining 30% for testing.
x = df1.drop("target", axis=1)
y = df1["target"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)
#check sample size
print('x_train', x_train.size)
print('x_test', x_test.size)
print('y_train', y_train.size)
print('y_test', y_test.size)
x_train 2067
x_test 897
y_train 159
y_test 69
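Because no random_state is set, the split (and therefore every accuracy reported below) will vary from run to run. A reproducible, class-balanced split could be obtained as sketched here (random_state=0 is an arbitrary choice; the variables with the _s suffix are not used below):

#Fix the seed and stratify on the target so both classes keep the same
#proportion in the training and test sets across runs.
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y)
print('stratified train/test rows:', x_train_s.shape[0], x_test_s.shape[0])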
Applying the machine learning algorithms for predictive modelling.
#Building classification models
#Feature names used when printing the random forest importances below
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
# ****************Logistic Regression*****************
logReg = LogisticRegression(random_state=0, solver='liblinear')
logReg.fit(x_train, y_train)
#Check accuracy of Logistic Regression
y_pred_logReg = logReg.predict(x_test)
#Model Accuracy
print("Accuracy of logistic regression classifier :: " ,metrics.accuracy_score(y_test,y_pred_logReg))
#Removing the features with low correlation and checking the effect on model accuracy
x_train1 = x_train.drop(["fbs", "trestbps", "chol", "restecg"], axis=1)
x_test1 = x_test.drop(["fbs", "trestbps", "chol", "restecg"], axis=1)
logReg1 = LogisticRegression(random_state=0, solver='liblinear').fit(x_train1, y_train)
y_pred_logReg1 = logReg1.predict(x_test1)
print("\nAccuracy of logistic regression classifier after removing features :: ", metrics.accuracy_score(y_test, y_pred_logReg1))
Accuracy of logistic regression classifier ::  0.855072463768116

Accuracy of logistic regression classifier after removing features ::  0.8695652173913043
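cross_val_score was imported at the top but never used. A k-fold cross-validated accuracy would give a less split-dependent estimate for the reduced-feature logistic regression; a sketch with 5 folds (an assumption):

#5-fold cross-validation on the reduced feature set: averages accuracy over
#five different train/test partitions instead of relying on a single split.
x_reduced = x.drop(["fbs", "trestbps", "chol", "restecg"], axis=1)
cv_scores = cross_val_score(LogisticRegression(random_state=0, solver='liblinear'),
                            x_reduced, y, cv=5)
print("Mean 5-fold CV accuracy :: ", cv_scores.mean())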
# ***********************Decision Tree Classification***********************
decTree = DecisionTreeClassifier(max_depth=6, random_state=0)
decTree.fit(x_train,y_train)
y_pred_decTree = decTree.predict(x_test)
print("Accuracy of Decision Trees :: " , metrics.accuracy_score(y_test,y_pred_decTree))
#Remove features which have low correlation with the target (fbs, trestbps, chol), plus age and sex
x_train_dt = x_train.drop(["fbs", "trestbps", "chol", "age", "sex"], axis=1)
x_test_dt = x_test.drop(["fbs", "trestbps", "chol", "age", "sex"], axis=1)
decTree1 = DecisionTreeClassifier(max_depth=6, random_state=0)
decTree1.fit(x_train_dt, y_train)
y_pred_dt1 = decTree1.predict(x_test_dt)
print("Accuracy of Decision Tree after removing features :: ", metrics.accuracy_score(y_test, y_pred_dt1))
Accuracy of Decision Trees ::  0.7391304347826086
Accuracy of Decision Tree after removing features ::  0.7681159420289855
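The tree module imported at the top can be used to inspect the fitted model. A sketch visualising the reduced-feature decision tree (class labels are simply the raw target values):

#Plot the fitted decision tree; feature names come from the reduced training
#frame so the split rules are readable.
plt.figure(figsize=(20, 10))
tree.plot_tree(decTree1, feature_names=list(x_train_dt.columns),
               class_names=['0', '1'], filled=True, fontsize=8)
plt.show()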
# Using Random forest classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x_train,y_train)
y_pred_rf = rf.predict(x_test)
print("Accuracy of Random Forest Classifier :: ", metrics.accuracy_score(y_test, y_pred_rf))
#Find the score of each feature in model and drop the features with low scores
f_imp = rf.feature_importances_
for i,v in enumerate(f_imp):
print('Feature: %s, Score: %.5f' % (names[i],v))
Accuracy of Random Forest Classifier ::  0.7971014492753623
Feature: age, Score: 0.11355
Feature: sex, Score: 0.03915
Feature: cp, Score: 0.11459
Feature: trestbps, Score: 0.07066
Feature: chol, Score: 0.07456
Feature: fbs, Score: 0.00000
Feature: restecg, Score: 0.01775
Feature: thalach, Score: 0.11485
Feature: exang, Score: 0.02684
Feature: oldpeak, Score: 0.12835
Feature: slope, Score: 0.03607
Feature: ca, Score: 0.11449
Feature: thal, Score: 0.14914
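The same importances are easier to compare when sorted and plotted; a small addition, not in the original notebook:

#Rank the random forest importances and show them as a horizontal bar chart.
importances = pd.Series(rf.feature_importances_, index=x_train.columns).sort_values()
importances.plot(kind='barh', figsize=(8, 6), title='Random forest feature importances')
plt.show()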
#K Neighbours Classifier
knc = KNeighborsClassifier()
knc.fit(x_train,y_train)
y_pred_knc = knc.predict(x_test)
print("Accuracy of K-Neighbours classifier :: ", metrics.accuracy_score(y_test,y_pred_knc))
Accuracy of K-Neighbours classifier :: 0.6231884057971014
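KNeighborsClassifier defaults to n_neighbors=5 and is sensitive to feature scale, which partly explains the lower accuracy. Sweeping k gives a quick check; a sketch (the range 1-20 is an arbitrary choice):

#Try several values of k and record the test accuracy of each;
#the model above uses the default k=5.
for k in range(1, 21):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(x_train, y_train)
    acc = metrics.accuracy_score(y_test, knn_k.predict(x_test))
    print("k =", k, " accuracy =", round(acc, 3))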
#Models and their accuracy
print("*****************Models and their accuracy*****************")
print("Logistic Regression Classifier :: ", metrics.accuracy_score(y_test,y_pred_logReg1))
print("Decision Tree :: ", metrics.accuracy_score(y_test,y_pred_dt1))
print("Random Forest Classifier :: ", metrics.accuracy_score(y_test, y_pred_rf))
print("K Neighbours Classifier :: ", metrics.accuracy_score(y_test,y_pred_knc))
*****************Models and their accuracy*****************
Logistic Regression Classifier ::  0.8695652173913043
Decision Tree ::  0.7681159420289855
Random Forest Classifier ::  0.7971014492753623
K Neighbours Classifier ::  0.6231884057971014
Create a confusion matrix from the predictions of the model with the highest accuracy (the reduced-feature logistic regression).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_logReg1)
cm
array([[17,  3],
       [ 6, 43]], dtype=int64)
sns.heatmap(cm, annot=True, cmap='BuPu')
# sklearn's confusion matrix has true labels as rows and predicted labels as columns,
# so cm[0][0] counts true negatives and cm[1][1] counts true positives.
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
print('Testing accuracy:', (TP+TN) / (TP+TN+FN+FP))
Testing accuracy: 0.8695652173913043
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_logReg1))
              precision    recall  f1-score   support

           0       0.74      0.85      0.79        20
           1       0.93      0.88      0.91        49

    accuracy                           0.87        69
   macro avg       0.84      0.86      0.85        69
weighted avg       0.88      0.87      0.87        69
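Beyond accuracy, the ranking quality of the best model can be summarised with ROC AUC using its predicted probabilities; a sketch using sklearn.metrics.roc_auc_score:

#ROC AUC for the reduced-feature logistic regression: uses the predicted
#probability of class 1 rather than the hard 0/1 predictions.
from sklearn.metrics import roc_auc_score
y_prob_logReg1 = logReg1.predict_proba(x_test1)[:, 1]
print("ROC AUC of logistic regression classifier :: ", roc_auc_score(y_test, y_prob_logReg1))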