Expression quantitative trait loci (eQTL) analysis are designed to find genetic variations that affect the expression of one or more genes.
For example, a gene-SNP pair for which the expression of the gene correlates with the allelic configuration of the SNP is considered an eQTL.
Gene expression phenotypes are explained by eQTLs, which account for a fraction of genetic variance. EQTL analyses involve the direct correlation of genetic variation with gene expression levels typically measured across tens or hundreds of individuals.
We will explore the process using the codes and data provided in the eQTL package.
Set working Directory:
setwd("C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/6.BGA - Bioinformatics and Genome Analysis/week 6 - eQTL/practical")
Install required packages and load libraries:
if (!requireNamespace("MatrixEQTL", quietly = TRUE))
install.packages("MatrixEQTL")
library(MatrixEQTL)
This package contains 5 datasets:
base.dir = find.package('MatrixEQTL')
#modelANOVA, modelLINEAR, or modelLINEAR_CROSS
useModel = modelLINEAR
SNP_file_name = paste(base.dir, "/data/SNP.txt", sep="")
expression_file_name = paste(base.dir, "/data/GE.txt", sep="")
covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep="")
output_file_name = tempfile()
pvOutputThreshold = 1e-2
errorCovariance = numeric()
#errorCovariance = read.table("Sample_Data/errorCovariance.txt")
snps = SlicedData$new()
snps$fileDelimiter = "\t" # the TAB character
snps$fileOmitCharacters = "NA" # denote missing values;
snps$fileSkipRows = 1 # one row of column labels
snps$fileSkipColumns = 1 # one column of row labels
snps$fileSliceSize = 2000 # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name)
gene = SlicedData$new();
gene$fileDelimiter = "\t"; # the TAB character
gene$fileOmitCharacters = "NA"; # denote missing values;
gene$fileSkipRows = 1; # one row of column labels
gene$fileSkipColumns = 1; # one column of row labels
gene$fileSliceSize = 2000; # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name)
cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t" # the TAB character
cvrt$fileOmitCharacters = "NA" # denote missing values;
cvrt$fileSkipRows = 1 # one row of column labels
cvrt$fileSkipColumns = 1 # one column of row labels
if(length(covariates_file_name)>0) {
cvrt$LoadFile(covariates_file_name)
}
me = Matrix_eQTL_engine(snps = snps,
gene = gene,
cvrt = cvrt,
output_file_name = output_file_name,
pvOutputThreshold = pvOutputThreshold,
useModel = useModel,
errorCovariance = errorCovariance,
verbose = TRUE,
pvalue.hist = TRUE,
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE)
unlink(output_file_name)
cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n');
Analysis done in: 0.02 seconds
cat( 'Detected eQTLs: ', '\n');
Detected eQTLs:
show(me$all$eqtls)
snps gene statistic pvalue FDR beta
1 Snp_05 Gene_03 38.812160 5.515519e-14 8.273279e-12 0.4101317
2 Snp_13 Gene_09 -3.914403 2.055817e-03 1.541863e-01 -0.2978847
3 Snp_11 Gene_06 -3.221962 7.327756e-03 2.853368e-01 -0.2332470
4 Snp_04 Gene_10 3.201666 7.608981e-03 2.853368e-01 0.2321123
5 Snp_14 Gene_01 3.070005 9.716705e-03 2.915011e-01 0.2147077
#We can also figure look at the number and type of eQTL
show(me$all$neqtls)
[1] 5
View(me$all$eqtls)
View() will open the table in another tab in R
plot(me)
base.dir = find.package('MatrixEQTL')
# base.dir = '.'
useModel = modelLINEAR # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep="")
pvOutputThreshold_cis = 2e-2
pvOutputThreshold_tra = 1e-2
errorCovariance = numeric()
#errorCovariance = read.table("Sample_Data/errorCovariance.txt")
cisDist = 1e6
snps = SlicedData$new()
snps$fileDelimiter = "\t" # the TAB character
snps$fileOmitCharacters = "NA" # denote missing values;
snps$fileSkipRows = 1 # one row of column labels
snps$fileSkipColumns = 1 # one column of row labels
snps$fileSliceSize = 2000 # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name)
gene = SlicedData$new()
gene$fileDelimiter = "\t" # the TAB character
gene$fileOmitCharacters = "NA" # denote missing values;
gene$fileSkipRows = 1 # one row of column labels
gene$fileSkipColumns = 1 # one column of row labels
gene$fileSliceSize = 2000 # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name)
cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t" # the TAB character
cvrt$fileOmitCharacters = "NA" # denote missing values;
cvrt$fileSkipRows = 1 # one row of column labels
cvrt$fileSkipColumns = 1 # one column of row labels
if(length(covariates_file_name)>0) {
cvrt$LoadFile(covariates_file_name)
}
snpspos = read.table(snps_location_file_name,
header = TRUE,
stringsAsFactors = FALSE)
genepos = read.table(gene_location_file_name,
header = TRUE,
stringsAsFactors = FALSE)
me = Matrix_eQTL_main( snps = snps,
gene = gene,
cvrt = cvrt,
output_file_name = output_file_name_tra,
pvOutputThreshold = pvOutputThreshold_tra,
useModel = useModel,
errorCovariance = errorCovariance,
verbose = TRUE,
output_file_name.cis = output_file_name_cis,
pvOutputThreshold.cis = pvOutputThreshold_cis,
snpspos = snpspos,
genepos = genepos,
cisDist = cisDist,
pvalue.hist = "qqplot",
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE)
unlink(output_file_name_tra)
unlink(output_file_name_cis)
cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n')
Analysis done in: 0.03 seconds
cat('Detected local eQTLs:', '\n')
Detected local eQTLs:
show(me$cis$eqtls)
snps gene statistic pvalue FDR beta
1 Snp_05 Gene_03 38.812160 5.515519e-14 5.515519e-12 0.4101317
2 Snp_04 Gene_10 3.201666 7.608981e-03 3.804491e-01 0.2321123
cat('Detected distant eQTLs:', '\n')
Detected distant eQTLs:
show(me$trans$eqtls)
snps gene statistic pvalue FDR beta
1 Snp_13 Gene_09 -3.914403 0.002055817 0.1027908 -0.2978847
2 Snp_11 Gene_06 -3.221962 0.007327756 0.1619451 -0.2332470
3 Snp_14 Gene_01 3.070005 0.009716705 0.1619451 0.2147077
plot(me)
n = 100
ngs = 2000
pop = 0.2 * rnorm(n)
snps1 = SlicedData$new( t( snps.mat ) )
gene1 = SlicedData$new( t( gene.mat ) )
cvrt1 = SlicedData$new( )
rm(snps.mat, gene.mat)
snps1$ResliceCombined(500)
gene1$ResliceCombined(500)
filename = tempfile()
meh = Matrix_eQTL_engine(snps = snps1,
gene = gene1,
cvrt = cvrt1,
output_file_name = filename,
pvOutputThreshold = 1e-100,
useModel = modelLINEAR,
errorCovariance = numeric(),
verbose = TRUE,
pvalue.hist = 100
)
unlink(filename)
meq = Matrix_eQTL_engine(snps = snps1,
gene = gene1,
cvrt = cvrt1,
output_file_name = filename,
pvOutputThreshold = 1e-6,
useModel = modelLINEAR,
errorCovariance = numeric(),
verbose = TRUE,
pvalue.hist = "qqplot")
unlink(filename)
plot(meq, pch = 16, cex = 0.7)
# dev.off()