#Set the working directory
import os
os.chdir("C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/3.DSB - Data Science for Bioinformatics/Practice/W12 - Supervised ML in Python")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(color_codes=True)
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
The dataset has 14 columns; the 13 predictor features are described below. In the CSV used here they appear under the short column names age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca and thal, plus the target column.

- slope_of_peak_exercise_st_segment (type: int): the slope of the peak exercise ST segment, an electrocardiography readout indicating the quality of blood flow to the heart
- thal (type: categorical): result of the thallium stress test measuring blood flow to the heart, with possible values normal, fixed_defect, reversible_defect
- resting_blood_pressure (type: int): resting blood pressure
- chest_pain_type (type: int): chest pain type (4 values)
- num_major_vessels (type: int): number of major vessels (0-3) colored by fluoroscopy
- fasting_blood_sugar_gt_120_mg_per_dl (type: binary): fasting blood sugar > 120 mg/dl
- resting_ekg_results (type: int): resting electrocardiographic results (values 0, 1, 2)
- serum_cholesterol_mg_per_dl (type: int): serum cholesterol in mg/dl
- oldpeak_eq_st_depression (type: float): oldpeak = ST depression induced by exercise relative to rest, a measure of abnormality in electrocardiograms
- sex (type: binary): 0: female, 1: male
- age (type: int): age in years
- max_heart_rate_achieved (type: int): maximum heart rate achieved (beats per minute)
- exercise_induced_angina (type: binary): exercise-induced chest pain (0: False, 1: True)
df = pd.read_csv("C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/3.DSB - Data Science for Bioinformatics/Practice/DSB W12 - Supervised ML in Python/heart.csv")
df.head()
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
df.tail()
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
df
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
303 rows × 14 columns
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows :: ", duplicate_rows.shape)
Number of duplicate rows :: (1, 14)
#we have one duplicate row.
#Removing the duplicate row
df = df.drop_duplicates()
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows :: ", duplicate_rows.shape)
#Number of duplicate rows after dropping one duplicate row
Number of duplicate rows :: (0, 14)
There are now no duplicate rows in the data. Next, we check for null values.
df.isnull().sum()
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
There are no null values in this dataset
print(df.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       302 non-null    int64
 1   sex       302 non-null    int64
 2   cp        302 non-null    int64
 3   trestbps  302 non-null    int64
 4   chol      302 non-null    int64
 5   fbs       302 non-null    int64
 6   restecg   302 non-null    int64
 7   thalach   302 non-null    int64
 8   exang     302 non-null    int64
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64
 11  ca        302 non-null    int64
 12  thal      302 non-null    int64
 13  target    302 non-null    int64
dtypes: float64(1), int64(13)
memory usage: 35.4 KB
None
No null values in the dataset.
The data type is numeric for every feature, so we do not have to create dummy variables or apply one-hot encoding before fitting any algorithm.
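If the categorical columns (such as cp, thal or slope) had been stored as strings instead, they would need encoding first. A minimal, purely illustrative sketch using pandas get_dummies (not required for this CSV and not used below):

#Hypothetical: one-hot encode categorical columns if they were stored as text.
#Not needed here because cp, thal and slope are already integer-coded.
categorical_cols = ['cp', 'thal', 'slope']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print(df_encoded.columns.tolist())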
sns.boxplot(x=df['age'])
sns.boxplot(x=df['sex'])
sns.boxplot(x=df['cp'])
sns.boxplot(x=df['trestbps']) #Some outliers are observed in 'trestbps'; they will be removed later
sns.boxplot(x=df['chol']) #Some outliers are observed in 'chol'; they will be removed later
sns.boxplot(x=df['fbs'])
sns.boxplot(x=df['restecg'])
sns.boxplot(x=df['thalach']) #Outliers are present in 'thalach'
sns.boxplot(x=df['exang'])
sns.boxplot(x=df['oldpeak']) #Outliers are present in 'oldpeak'
sns.boxplot(x=df['slope'])
sns.boxplot(x=df['ca']) #Outliers are present in 'ca'
sns.boxplot(x=df['thal'])
Outliers can be removed using two methods:
Inter-Quartile Range (IQR)
Z-score
The IQR method is applied below; a Z-score sketch is shown after the IQR result.
#Find the InterQuartile Range
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3-Q1
print('*********** InterQuartile Range ***********')
print(IQR)
# Remove the outliers using IQR
df1 = df[~((df<(Q1-1.5*IQR))|(df>(Q3+1.5*IQR))).any(axis=1)]
df1.shape
*********** InterQuartile Range ***********
age         13.00
sex          1.00
cp           2.00
trestbps    20.00
chol        63.75
fbs          0.00
restecg      1.00
thalach     32.75
exang        1.00
oldpeak      1.60
slope        1.00
ca           1.00
thal         1.00
target       1.00
dtype: float64
(228, 14)
After removing outliers using the IQR method, the data contains 228 records.
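For comparison, the Z-score method mentioned above could be applied as follows. This is a sketch only (the threshold of 3 standard deviations is a common but arbitrary choice); the IQR-filtered df1 is what the rest of the analysis uses.

#Alternative outlier removal: keep rows where every feature is within 3
#standard deviations of its mean (|z| < 3).
z_scores = np.abs(stats.zscore(df))
df_z = df[(z_scores < 3).all(axis=1)]
print(df_z.shape)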
plt.figure(figsize=(20,10))
sns.heatmap(df.corr(), annot=True, cmap='terrain')
The target is positively correlated with cp, thalach and slope, and negatively correlated with sex, exang, ca, thal and oldpeak.
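To read these relationships off directly rather than from the heatmap, the correlation of each feature with the target can be printed and sorted (a small addition, not part of the original notebook):

#Correlation of every feature with the target, sorted from most positive to
#most negative, to back up the observation above.
print(df.corr()['target'].drop('target').sort_values(ascending=False))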
sns.pairplot(data=df1)
sns.pairplot(data=df1, kind="hist")
We use pair plots to visualise the relationships between the different features and to spot any linear relationships between them.
# Histograms show the shape of each feature's distribution and the count of observations in each bin.
df1.hist(figsize=(12,12), layout=(5,3));
df1.plot(kind='box', subplots=True, layout=(5,3), figsize=(12,12))
plt.show()
Box-and-whisker plots are useful for spotting outliers in the data. If many outliers remain, they should be removed or fixed, otherwise they act as noise in the training data.
sns.catplot(data=df1, x='sex', y='age', hue='target', palette='husl')
sns.barplot(data=df1, x='sex', y='chol', hue='target', palette='spring')
df1['sex'].value_counts()
1    154
0     74
Name: sex, dtype: int64
There are 154 males and 74 females in our data
df1['cp'].value_counts()
0    108
2     61
1     42
3     17
Name: cp, dtype: int64
There are 4 chest pain types, ranging from 0 to 3.
sns.countplot(x='cp', hue='target', data=df, palette='rocket')
gen = pd.crosstab(df1['sex'], df1['target'])
print(gen)
target   0   1
sex
0       12  62
1       84  70
df1.head()
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
5 | 57 | 1 | 0 | 140 | 192 | 0 | 1 | 148 | 0 | 0.4 | 1 | 0 | 1 | 1 |
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
<ipython-input-25-9c6c969fdeb6>:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:1736: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
Note that the scaling above was applied to df; df1 (the outlier-filtered frame used for modelling below) still holds the unscaled values:
df1.head()
 | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
5 | 57 | 1 | 0 | 140 | 192 | 0 | 1 | 148 | 0 | 0.4 | 1 | 0 | 1 | 1 |
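To scale df1 itself without triggering the SettingWithCopyWarning, an explicit copy can be taken first. A sketch only; df1_scaled is a new name introduced here and is not used by the models below.

#Work on an explicit copy so the assignment does not write into a view of df,
#which is what raises the SettingWithCopyWarning above.
df1_scaled = df1.copy()
scaler1 = StandardScaler()
df1_scaled[columns_to_scale] = scaler1.fit_transform(df1_scaled[columns_to_scale])
df1_scaled.head()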
#Divide the dataset into training and test data: 70% of the records are used for training and the remaining 30% for testing.
x = df1.drop("target", axis=1)
y = df1["target"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)
#check sample size
print('x_train', x_train.size)
print('x_test', x_test.size)
print('y_train', y_train.size)
print('y_test', y_test.size)
x_train 2067
x_test 897
y_train 159
y_test 69
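Because no random_state is set, the split (and therefore every accuracy reported below) will vary from run to run. A reproducible, class-balanced split could be obtained as sketched here (random_state=0 is an arbitrary choice; the variables with the _s suffix are not used below):

#Fix the seed and stratify on the target so both classes keep the same
#proportion in the training and test sets across runs.
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y)
print('stratified train/test rows:', x_train_s.shape[0], x_test_s.shape[0])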
Applying the machine learning algorithms for predictive modelling.
#Building classification models
#Feature names used when printing the random forest importances below
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
# ****************Logistic Regression*****************
logReg = LogisticRegression(random_state=0, solver='liblinear')
logReg.fit(x_train, y_train)
#Check accuracy of Logistic Regression
y_pred_logReg = logReg.predict(x_test)
#Model Accuracy
print("Accuracy of logistic regression classifier :: " ,metrics.accuracy_score(y_test,y_pred_logReg))
#Removing the features with low correlation and checking the effect on model accuracy
x_train1 = x_train.drop(["fbs", "trestbps", "chol", "restecg"], axis=1)
x_test1 = x_test.drop(["fbs", "trestbps", "chol", "restecg"], axis=1)
logReg1 = LogisticRegression(random_state=0, solver='liblinear').fit(x_train1, y_train)
y_pred_logReg1 = logReg1.predict(x_test1)
print("\nAccuracy of logistic regression classifier after removing features :: ", metrics.accuracy_score(y_test, y_pred_logReg1))
Accuracy of logistic regression classifier ::  0.855072463768116

Accuracy of logistic regression classifier after removing features ::  0.8695652173913043
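cross_val_score was imported at the top but never used. A k-fold cross-validated accuracy would give a less split-dependent estimate for the reduced-feature logistic regression; a sketch with 5 folds (an assumption):

#5-fold cross-validation on the reduced feature set: averages accuracy over
#five different train/test partitions instead of relying on a single split.
x_reduced = x.drop(["fbs", "trestbps", "chol", "restecg"], axis=1)
cv_scores = cross_val_score(LogisticRegression(random_state=0, solver='liblinear'),
                            x_reduced, y, cv=5)
print("Mean 5-fold CV accuracy :: ", cv_scores.mean())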
# ***********************Decision Tree Classification***********************
decTree = DecisionTreeClassifier(max_depth=6, random_state=0)
decTree.fit(x_train,y_train)
y_pred_decTree = decTree.predict(x_test)
print("Accuracy of Decision Trees :: " , metrics.accuracy_score(y_test,y_pred_decTree))
#Remove features which have low correlation with the target (fbs, trestbps, chol), plus age and sex
x_train_dt = x_train.drop(["fbs", "trestbps", "chol", "age", "sex"], axis=1)
x_test_dt = x_test.drop(["fbs", "trestbps", "chol", "age", "sex"], axis=1)
decTree1 = DecisionTreeClassifier(max_depth=6, random_state=0)
decTree1.fit(x_train_dt, y_train)
y_pred_dt1 = decTree1.predict(x_test_dt)
print("Accuracy of Decision Tree after removing features :: ", metrics.accuracy_score(y_test, y_pred_dt1))
Accuracy of Decision Trees ::  0.7391304347826086
Accuracy of Decision Tree after removing features ::  0.7681159420289855
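The tree module imported at the top can be used to inspect the fitted model. A sketch visualising the reduced-feature decision tree (class labels are simply the raw target values):

#Plot the fitted decision tree; feature names come from the reduced training
#frame so the split rules are readable.
plt.figure(figsize=(20, 10))
tree.plot_tree(decTree1, feature_names=list(x_train_dt.columns),
               class_names=['0', '1'], filled=True, fontsize=8)
plt.show()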
# Using Random forest classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x_train,y_train)
y_pred_rf = rf.predict(x_test)
print("Accuracy of Random Forest Classifier :: ", metrics.accuracy_score(y_test, y_pred_rf))
#Find the score of each feature in model and drop the features with low scores
f_imp = rf.feature_importances_
for i,v in enumerate(f_imp):
print('Feature: %s, Score: %.5f' % (names[i],v))
Accuracy of Random Forest Classifier ::  0.7971014492753623
Feature: age, Score: 0.11355
Feature: sex, Score: 0.03915
Feature: cp, Score: 0.11459
Feature: trestbps, Score: 0.07066
Feature: chol, Score: 0.07456
Feature: fbs, Score: 0.00000
Feature: restecg, Score: 0.01775
Feature: thalach, Score: 0.11485
Feature: exang, Score: 0.02684
Feature: oldpeak, Score: 0.12835
Feature: slope, Score: 0.03607
Feature: ca, Score: 0.11449
Feature: thal, Score: 0.14914
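The same importances are easier to compare when sorted and plotted; a small addition, not in the original notebook:

#Rank the random forest importances and show them as a horizontal bar chart.
importances = pd.Series(rf.feature_importances_, index=x_train.columns).sort_values()
importances.plot(kind='barh', figsize=(8, 6), title='Random forest feature importances')
plt.show()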
#K Neighbours Classifier
knc = KNeighborsClassifier()
knc.fit(x_train,y_train)
y_pred_knc = knc.predict(x_test)
print("Accuracy of K-Neighbours classifier :: ", metrics.accuracy_score(y_test,y_pred_knc))
Accuracy of K-Neighbours classifier :: 0.6231884057971014
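KNeighborsClassifier defaults to n_neighbors=5 and is sensitive to feature scale, which partly explains the lower accuracy. Sweeping k gives a quick check; a sketch (the range 1-20 is an arbitrary choice):

#Try several values of k and record the test accuracy of each;
#the model above uses the default k=5.
for k in range(1, 21):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(x_train, y_train)
    acc = metrics.accuracy_score(y_test, knn_k.predict(x_test))
    print("k =", k, " accuracy =", round(acc, 3))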
#Models and their accuracy
print("*****************Models and their accuracy*****************")
print("Logistic Regression Classifier :: ", metrics.accuracy_score(y_test,y_pred_logReg1))
print("Decision Tree :: ", metrics.accuracy_score(y_test,y_pred_dt1))
print("Random Forest Classifier :: ", metrics.accuracy_score(y_test, y_pred_rf))
print("K Neighbours Classifier :: ", metrics.accuracy_score(y_test,y_pred_knc))
*****************Models and their accuracy*****************
Logistic Regression Classifier ::  0.8695652173913043
Decision Tree ::  0.7681159420289855
Random Forest Classifier ::  0.7971014492753623
K Neighbours Classifier ::  0.6231884057971014
Create a confusion matrix from the predictions of the model with the highest accuracy (the reduced-feature logistic regression).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_logReg1)
cm
array([[17,  3],
       [ 6, 43]], dtype=int64)
sns.heatmap(cm, annot=True, cmap='BuPu')
# sklearn's confusion matrix has true labels as rows and predicted labels as columns,
# so cm[0][0] counts true negatives and cm[1][1] counts true positives.
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
print('Testing accuracy:', (TP+TN) / (TP+TN+FN+FP))
Testing accuracy: 0.8695652173913043
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_logReg1))
              precision    recall  f1-score   support

           0       0.74      0.85      0.79        20
           1       0.93      0.88      0.91        49

    accuracy                           0.87        69
   macro avg       0.84      0.86      0.85        69
weighted avg       0.88      0.87      0.87        69
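Beyond accuracy, the ranking quality of the best model can be summarised with ROC AUC using its predicted probabilities; a sketch using sklearn.metrics.roc_auc_score:

#ROC AUC for the reduced-feature logistic regression: uses the predicted
#probability of class 1 rather than the hard 0/1 predictions.
from sklearn.metrics import roc_auc_score
y_prob_logReg1 = logReg1.predict_proba(x_test1)[:, 1]
print("ROC AUC of logistic regression classifier :: ", roc_auc_score(y_test, y_prob_logReg1))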