Machine learning (ML) is a type of artificial intelligence (AI) that allows software applications to become more accurate at predicting outcomes without being explicitly programmed to do so. Machine learning algorithms use historical data as input to predict new output values. ML can be used in many fields, including bioinformatics, where it is applied to genomics, proteomics, microarray analysis, ...
import os
# Make the practice folder the working directory so that data files can be
# opened by bare file name.
# NOTE(review): this is a machine-specific absolute path; it must be adapted
# before the script is run on another computer.
os.chdir(
    path='C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/3.DSB - Data Science for Bioinformatics/Practice/DSB W11 - Exploratory Data Analysis with Unsupervised Machine Learning'
)
# Query the working directory to confirm the switch (shown as cell output
# in a notebook; the value is discarded when run as a plain script).
os.getcwd()
'C:\\Users\\manso\\OneDrive - University of West London\\MSc Bioinformatics - UWL\\3.DSB - Data Science for Bioinformatics\\Practice\\DSB W11 - Exploratory Data Analysis with Unsupervised Machine Learning'
import pandas as pd
# Read the spreadsheet 'Table1.xlsx' (resolved against the current working
# directory) into a DataFrame and show it. The captured output lists an
# unnamed patient-label column followed by the IRX4, OCT4 and PAX6 columns.
Table1 = pd.read_excel(io='Table1.xlsx')
print(Table1)
Unnamed: 0 IRX4 OCT4 PAX6 0 patient1 11 10 1 1 patient2 13 13 3 2 patient3 2 4 10 3 patient4 1 3 9
import numpy as np

# Positional slice: keep the first four rows and columns 1-3, which drops
# column 0 (the patient labels) and leaves only the numeric measurements.
rmpt = Table1.iloc[:4, 1:4]

# Bio.Cluster works on plain arrays, so convert the numeric part of the
# table to a NumPy array (one row per patient) and display it.
T1data = rmpt.to_numpy()
print(T1data)
[[11 10 1] [13 13 3] [ 2 4 10] [ 1 3 9]]
from Bio.Cluster import distancematrix
# Pairwise distances between the rows (patients) of T1data using metric
# code 'e'. The result is a ragged lower-triangular list: entry i holds the
# distances from row i to rows 0..i-1.
# NOTE: the printed values are the squared differences averaged over the
# columns, not a square-rooted norm. For patients 1 and 2:
# ((11-13)**2 + (10-13)**2 + (1-3)**2) / 3 = 5.667.
distancesE = distancematrix(data=T1data, dist='e')
print("Euclidean distance \n", distancesE, "\n")
Euclidean distance [array([], dtype=float64), array([5.66666667]), array([66. , 83.66666667]), array([71. , 93.33333333, 1. ])]
# Turn the ragged lower-triangular list into a rectangular DataFrame for
# easier reading; positions with no value (diagonal and upper triangle)
# show up as NaN.
Edf = pd.DataFrame(data=distancesE)
print("Euclidean distance dataframe \n", Edf, "\n")
Euclidean distance dataframe 0 1 2 0 NaN NaN NaN 1 5.666667 NaN NaN 2 66.000000 83.666667 NaN 3 71.000000 93.333333 1.0
# Same pairwise comparison of the patient rows, this time with metric code
# 'b' (city-block / Manhattan).
# NOTE: as with 'e', the printed values are averaged over the columns.
# For patients 1 and 2: (|11-13| + |10-13| + |1-3|) / 3 = 2.333.
distancesM = distancematrix(data=T1data, dist='b')
print("Manhattan \n", distancesM, "\n")

# Rectangular view of the ragged result; missing cells become NaN.
Mdf = pd.DataFrame(data=distancesM)
print("Manhattan distance dataframe \n", Mdf, "\n")
Manhattan [array([], dtype=float64), array([2.33333333]), array([8., 9.]), array([8.33333333, 9.33333333, 1. ])] Manhattan distance dataframe 0 1 2 0 NaN NaN NaN 1 2.333333 NaN NaN 2 8.000000 9.000000 NaN 3 8.333333 9.333333 1.0
# Load Table2.csv, using its first column as the row index. The captured
# output shows rows g1..g10 and columns t1..t5.
T2 = pd.read_csv(filepath_or_buffer="Table2.csv", index_col=0)
print(T2)
t1 t2 t3 t4 t5 g1 -0.129767 0.022354 -0.954649 0.110975 0.388491 g2 -0.337756 -1.188563 -1.449266 0.283642 0.357560 g3 0.196316 -0.582125 0.898658 -0.766928 -1.945763 g4 0.727994 0.134512 0.350455 -0.451750 -0.619588 g5 0.124324 1.729350 -0.062252 0.030506 0.457010 g6 -1.595889 1.141006 1.397045 -0.463734 -1.689223 g7 -0.216876 -0.400143 0.715925 -1.041363 0.971422 g8 0.558435 -1.272570 0.723619 -1.114531 -0.648241 g9 -0.005076 1.475324 0.096234 -0.786503 0.926660 g10 -1.135467 1.630329 0.374782 -1.121340 -0.007128
import sklearn
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
# Standardise Table 2 before computing distances. StandardScaler is
# documented to centre each column to mean 0 and scale it to unit variance.
scale = StandardScaler()

# Learn the per-column statistics from T2, then apply them to the same data.
# The result prints as a bare array, without the g*/t* labels.
T2_scaled = scale.fit(T2).transform(T2)
print(T2_scaled)
[[ 0.076251 -0.22554605 -1.42739099 1.29334503 0.5870607 ] [-0.23104779 -1.33310594 -2.03408429 1.64061069 0.55516874] [ 0.55803174 -0.77842989 0.84586172 -0.47227618 -1.81971411] [ 1.34357502 -0.12296089 0.17344077 0.16160417 -0.45233774] [ 0.45166593 1.33575151 -0.33278304 1.13150784 0.65770773] [-2.08991155 0.79762475 1.45717952 0.13750113 -1.55520469] [-0.05244971 -0.61198092 0.62172256 -1.02421519 1.18810151] [ 1.0930545 -1.40994286 0.63116058 -1.17136901 -0.48188123] [ 0.26047923 1.1034079 -0.13838587 -0.51164515 1.14194924] [-1.40964837 1.2451824 0.20327903 -1.18506333 0.17914984]]
from Bio.Cluster import distancematrix
# Pairwise distances between the ten standardised rows of Table 2 with
# metric code 'e'; returned as a ragged lower-triangular list (row i holds
# the distances to rows 0..i-1).
dist_ET2 = distancematrix(data=T2_scaled, dist='e')
print("Table 2 Euclidean distance: \n", dist_ET2, "\n")
Table 2 Euclidean distance: [array([], dtype=float64), array([0.36216175]), array([2.92309087, 3.8657521 ]), array([1.30809654, 2.4039169 , 0.75407804]), array([0.76158706, 2.1506048 , 2.91600446, 1.07050406]), array([3.99703235, 5.37946501, 2.06221099, 3.10023792, 3.16763863]), array([2.01941951, 3.02142589, 1.96044454, 1.29723814, 1.97746689, 3.14230818]), array([2.77835436, 3.56907668, 0.60193643, 0.74125617, 3.09626178, 3.71041983, 0.95190177]), array([1.40549674, 2.9496775 , 2.67431881, 1.15386568, 0.61255355, 3.17193985, 0.77662124, 2.13487526]), array([2.66778462, 4.23365772, 2.57664138, 2.33304707, 1.87111517, 1.39849893, 1.30199887, 2.78668881, 0.86132717])]
# Tabulate the ragged Euclidean result; cells on and above the diagonal
# have no value and are shown as NaN.
DT2 = pd.DataFrame(data=dist_ET2)
print("Table 2 Euclidean distance dataframe: \n", DT2, "\n")
Table 2 Euclidean distance dataframe: 0 1 2 3 4 5 6 \ 0 NaN NaN NaN NaN NaN NaN NaN 1 0.362162 NaN NaN NaN NaN NaN NaN 2 2.923091 3.865752 NaN NaN NaN NaN NaN 3 1.308097 2.403917 0.754078 NaN NaN NaN NaN 4 0.761587 2.150605 2.916004 1.070504 NaN NaN NaN 5 3.997032 5.379465 2.062211 3.100238 3.167639 NaN NaN 6 2.019420 3.021426 1.960445 1.297238 1.977467 3.142308 NaN 7 2.778354 3.569077 0.601936 0.741256 3.096262 3.710420 0.951902 8 1.405497 2.949678 2.674319 1.153866 0.612554 3.171940 0.776621 9 2.667785 4.233658 2.576641 2.333047 1.871115 1.398499 1.301999 7 8 0 NaN NaN 1 NaN NaN 2 NaN NaN 3 NaN NaN 4 NaN NaN 5 NaN NaN 6 NaN NaN 7 NaN NaN 8 2.134875 NaN 9 2.786689 0.861327
# Pairwise comparison of the standardised Table 2 rows with metric code 's'
# (Spearman rank based).
# NOTE(review): the printed values run from 0 up to 1.9, which looks like a
# distance of the form 1 - correlation rather than the correlation itself;
# confirm against the Bio.Cluster documentation.
dist_ST2 = distancematrix(data=T2_scaled, dist='s')
print("Table 2 Spearman's rank correlation': \n", dist_ST2, "\n")
Table 2 Spearman's rank correlation': [array([], dtype=float64), array([0.]), array([1.5, 1.5]), array([1.3, 1.3, 0.1]), array([0.6, 0.6, 1.7, 1.6]), array([1.6, 1.6, 0.7, 1.1, 1. ]), array([1.3, 1.3, 1.1, 1.2, 1.6, 1.1]), array([1.2, 1.2, 0.4, 0.3, 1.9, 1.4, 0.5]), array([1.1, 1.1, 1.7, 1.6, 0.8, 1.3, 0.4, 1.1]), array([1.6, 1.6, 1.2, 1.5, 0.7, 0.2, 0.9, 1.6, 0.7])]
# Tabulate the Spearman-based result (dist_ST2) as a DataFrame; cells with
# no value (diagonal and upper triangle) become NaN.
# FIX: this frame was previously built from dist_ET2, so the "Spearman"
# table simply repeated the Euclidean distances (0.362162, 2.923091, ...).
# Any captured output showing those values predates the fix; re-run the
# cell to refresh it.
ST2 = pd.DataFrame(dist_ST2)
print("Table 2 Spearman’s rank correlation dataframe: \n", ST2, "\n")
Table 2 Spearman’s rank correlation dataframe: 0 1 2 3 4 5 6 \ 0 NaN NaN NaN NaN NaN NaN NaN 1 0.362162 NaN NaN NaN NaN NaN NaN 2 2.923091 3.865752 NaN NaN NaN NaN NaN 3 1.308097 2.403917 0.754078 NaN NaN NaN NaN 4 0.761587 2.150605 2.916004 1.070504 NaN NaN NaN 5 3.997032 5.379465 2.062211 3.100238 3.167639 NaN NaN 6 2.019420 3.021426 1.960445 1.297238 1.977467 3.142308 NaN 7 2.778354 3.569077 0.601936 0.741256 3.096262 3.710420 0.951902 8 1.405497 2.949678 2.674319 1.153866 0.612554 3.171940 0.776621 9 2.667785 4.233658 2.576641 2.333047 1.871115 1.398499 1.301999 7 8 0 NaN NaN 1 NaN NaN 2 NaN NaN 3 NaN NaN 4 NaN NaN 5 NaN NaN 6 NaN NaN 7 NaN NaN 8 2.134875 NaN 9 2.786689 0.861327
from Bio.Cluster import treecluster
# Hierarchical clustering of the ten standardised Table 2 rows, built three
# ways for comparison.

# 1) From the data, with the library's default linkage method and distance.
T2tree = treecluster(data=T2_scaled)

# 2) From the data, using the Spearman-based metric ('s').
T2tree2 = treecluster(data=T2_scaled, dist='s')

# 3) From the precomputed Spearman distance matrix instead of the raw data
#    (data must then be None).
# NOTE(review): the Biopython documentation warns that treecluster may
# reorder the values of a supplied distance matrix; copy dist_ST2 first if
# it is needed again afterwards. Confirm for the installed version.
T2tree3 = treecluster(distancematrix=dist_ST2, data=None)

# Show the first tree object (notebook cell output; a no-op in a script).
T2tree
<Bio.Cluster.Tree at 0x2a8f7af9600>