eQTL Analysis with MatrixEQTL

Beatriz Manso
2022-04-28

Introduction

Expression quantitative trait loci (eQTL) analysis are designed to find genetic variations that affect the expression of one or more genes.

For example, a gene-SNP pair for which the expression of the gene correlates with the allelic configuration of the SNP is considered an eQTL.

Gene expression phenotypes are explained by eQTLs, which account for a fraction of genetic variance. EQTL analyses involve the direct correlation of genetic variation with gene expression levels typically measured across tens or hundreds of individuals.

We will explore the process using the codes and data provided in the eQTL package.

Methods

Set working Directory:

setwd("C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/6.BGA - Bioinformatics and Genome Analysis/week 6 - eQTL/practical")

Install required packages and load libraries:

if (!requireNamespace("MatrixEQTL", quietly = TRUE))
   install.packages("MatrixEQTL")

library(MatrixEQTL)

This package contains 5 datasets:

  1. snpsloc.txt
  2. geneloc.txt
  3. Covariates.txt
  4. GE.txt
  5. SNP.txt

1. Test all gene-SNP pairs and plot a histogram of all p-values:

Location of the package with the data files.

base.dir = find.package('MatrixEQTL')

Settings

#modelANOVA, modelLINEAR, or modelLINEAR_CROSS
useModel = modelLINEAR  
SNP_file_name = paste(base.dir, "/data/SNP.txt", sep="")
expression_file_name = paste(base.dir, "/data/GE.txt", sep="")
covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep="")
output_file_name = tempfile()
pvOutputThreshold = 1e-2
errorCovariance = numeric()
#errorCovariance = read.table("Sample_Data/errorCovariance.txt")

Load genotype data

snps = SlicedData$new()
snps$fileDelimiter = "\t"       # the TAB character
snps$fileOmitCharacters = "NA"  # denote missing values;
snps$fileSkipRows = 1           # one row of column labels
snps$fileSkipColumns = 1        # one column of row labels
snps$fileSliceSize = 2000       # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name)

Load gene expression data

gene = SlicedData$new();
gene$fileDelimiter = "\t";        # the TAB character
gene$fileOmitCharacters = "NA";   # denote missing values;
gene$fileSkipRows = 1;            # one row of column labels
gene$fileSkipColumns = 1;         # one column of row labels
gene$fileSliceSize = 2000;        # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name)

Load covariates

cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t"       # the TAB character
cvrt$fileOmitCharacters = "NA"  # denote missing values;
cvrt$fileSkipRows = 1           # one row of column labels
cvrt$fileSkipColumns = 1        # one column of row labels

if(length(covariates_file_name)>0) {
  cvrt$LoadFile(covariates_file_name)
  }

Run analysis

me = Matrix_eQTL_engine(snps = snps,
                        gene = gene,
                        cvrt = cvrt,
                        output_file_name = output_file_name,
                        pvOutputThreshold = pvOutputThreshold,
                        useModel = useModel,
                        errorCovariance = errorCovariance,
                        verbose = TRUE,
                        pvalue.hist = TRUE,
                        min.pv.by.genesnp = FALSE,
                        noFDRsaveMemory = FALSE)

unlink(output_file_name)

Results

cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n');
Analysis done in:  0.02  seconds 
cat( 'Detected eQTLs: ', '\n');
Detected eQTLs:  
show(me$all$eqtls)
    snps    gene statistic       pvalue          FDR       beta
1 Snp_05 Gene_03 38.812160 5.515519e-14 8.273279e-12  0.4101317
2 Snp_13 Gene_09 -3.914403 2.055817e-03 1.541863e-01 -0.2978847
3 Snp_11 Gene_06 -3.221962 7.327756e-03 2.853368e-01 -0.2332470
4 Snp_04 Gene_10  3.201666 7.608981e-03 2.853368e-01  0.2321123
5 Snp_14 Gene_01  3.070005 9.716705e-03 2.915011e-01  0.2147077
#We can also figure look at the number and type of eQTL
show(me$all$neqtls)
[1] 5

View the eQTL in the table

View(me$all$eqtls)

View() will open the table in another tab in R

Plot the histogram of all p-values

plot(me)

2. Test local and distand gene-SNP pairs separately and plot Q-Q plots of local and distant p-values:

Location of the package with the data files.

base.dir = find.package('MatrixEQTL')
# base.dir = '.'

Settings

useModel = modelLINEAR  # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
SNP_file_name = paste(base.dir, "/data/SNP.txt", sep="")
snps_location_file_name = paste(base.dir, "/data/snpsloc.txt", sep="")
expression_file_name = paste(base.dir, "/data/GE.txt", sep="")
gene_location_file_name = paste(base.dir, "/data/geneloc.txt", sep="");
covariates_file_name = paste(base.dir, "/data/Covariates.txt", sep="")
output_file_name_cis = tempfile()
output_file_name_tra = tempfile()
pvOutputThreshold_cis = 2e-2
pvOutputThreshold_tra = 1e-2
errorCovariance = numeric()
#errorCovariance = read.table("Sample_Data/errorCovariance.txt")
cisDist = 1e6

Load genotype data

snps = SlicedData$new()
snps$fileDelimiter = "\t"       # the TAB character
snps$fileOmitCharacters = "NA"  # denote missing values;
snps$fileSkipRows = 1           # one row of column labels
snps$fileSkipColumns = 1        # one column of row labels
snps$fileSliceSize = 2000       # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name)

Load gene expression data

gene = SlicedData$new()
gene$fileDelimiter = "\t"      # the TAB character
gene$fileOmitCharacters = "NA" # denote missing values;
gene$fileSkipRows = 1          # one row of column labels
gene$fileSkipColumns = 1       # one column of row labels
gene$fileSliceSize = 2000      # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name)

Load covariates

cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t"      # the TAB character
cvrt$fileOmitCharacters = "NA" # denote missing values;
cvrt$fileSkipRows = 1          # one row of column labels
cvrt$fileSkipColumns = 1       # one column of row labels

if(length(covariates_file_name)>0) {
  cvrt$LoadFile(covariates_file_name)
  }

Run the analysis

snpspos = read.table(snps_location_file_name,
                     header = TRUE,
                     stringsAsFactors = FALSE)

genepos = read.table(gene_location_file_name,
                     header = TRUE,
                     stringsAsFactors = FALSE)

me = Matrix_eQTL_main( snps = snps, 
                       gene = gene,
                       cvrt = cvrt,
                       output_file_name = output_file_name_tra,
                       pvOutputThreshold = pvOutputThreshold_tra, 
                       useModel = useModel,
                       errorCovariance = errorCovariance,
                       verbose = TRUE,
                       output_file_name.cis = output_file_name_cis,
                       pvOutputThreshold.cis = pvOutputThreshold_cis,
                       snpspos = snpspos,
                       genepos = genepos,
                       cisDist = cisDist,
                       pvalue.hist = "qqplot",
                       min.pv.by.genesnp = FALSE,
                       noFDRsaveMemory = FALSE)

unlink(output_file_name_tra)
unlink(output_file_name_cis)

Results:

cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n')
Analysis done in:  0.03  seconds 
cat('Detected local eQTLs:', '\n')
Detected local eQTLs: 
show(me$cis$eqtls)
    snps    gene statistic       pvalue          FDR      beta
1 Snp_05 Gene_03 38.812160 5.515519e-14 5.515519e-12 0.4101317
2 Snp_04 Gene_10  3.201666 7.608981e-03 3.804491e-01 0.2321123
cat('Detected distant eQTLs:', '\n')
Detected distant eQTLs: 
show(me$trans$eqtls)
    snps    gene statistic      pvalue       FDR       beta
1 Snp_13 Gene_09 -3.914403 0.002055817 0.1027908 -0.2978847
2 Snp_11 Gene_06 -3.221962 0.007327756 0.1619451 -0.2332470
3 Snp_14 Gene_01  3.070005 0.009716705 0.1619451  0.2147077

Plot the Q-Q plot of local and distant p-values

plot(me)

3. Create an artificial dataset and plot the histogram and Q-Q plot of all p-values

Define number of samples

n = 100

Define number of variables

ngs = 2000

Common signal in all variables (population stratification)

pop = 0.2 * rnorm(n)

Data matrices

snps.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop
gene.mat = matrix(rnorm(n*ngs), ncol = ngs) + pop + snps.mat*((1:ngs)/ngs)^9/2

Data objects for Matrix eQTL engine

snps1 = SlicedData$new( t( snps.mat ) )
gene1 = SlicedData$new( t( gene.mat ) )
cvrt1 = SlicedData$new( )

rm(snps.mat, gene.mat)

Slice data in blocks of 500 variables

snps1$ResliceCombined(500)
gene1$ResliceCombined(500)

Name of temporary output file

filename = tempfile()

Perform analysis recording information for a histogram

meh = Matrix_eQTL_engine(snps = snps1,
                         gene = gene1, 
                         cvrt = cvrt1,
                         output_file_name = filename,
                         pvOutputThreshold = 1e-100,
                         useModel = modelLINEAR,
                         errorCovariance = numeric(),
                         verbose = TRUE,
                         pvalue.hist = 100
                         )

unlink(filename)

Create PNG with the histogram and plot it

png(filename = "histogram.png", width = 650, height = 650)
plot(meh, col="grey")
# dev.off()

Perform analysis recording information for a Q-Q plot

meq = Matrix_eQTL_engine(snps = snps1,
                         gene = gene1,
                         cvrt = cvrt1,
                         output_file_name = filename,
                         pvOutputThreshold = 1e-6,
                         useModel = modelLINEAR,
                         errorCovariance = numeric(),
                         verbose = TRUE,
                         pvalue.hist = "qqplot")

unlink(filename)

Create PNG with the QQplot and plot it

plot(meq, pch = 16, cex = 0.7)
# dev.off()