ChIP-seq (Chromatin immunoprecipitation with sequencing) is a method used to identify protein–DNA interactions through antibody pulldown, sequencing, and analysis, with peak enrichment as the most critical step.
Currently, ChIP-seq is the most commonly used method for determining protein-DNA interaction, and its application in mapping histone modifications is pivotal in epigenetic research.
Set working directory:
setwd("C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/6.BGA - Bioinformatics and Genome Analysis/week 7 - ChIP-seq analysis/practical")
Install and Load Libraries:
packages = c(
'AnnotationHub',
'Biostrings',
'BSgenome',
'BSgenome.Hsapiens.UCSC.hg38',
'circlize',
'ComplexHeatmap',
'dplyr',
'GenomicAlignments',
'GenomicFeatures',
'GenomicRanges',
'GenomeInfoDB',
'ggplot2',
'Gviz',
'JASPAR2018',
'MotifDb',
'motifRG',
'motifStack',
'normr',
'rtracklayer',
'seqLogo',
'TFBSTools',
'tidyr',
'rGADEM',
'BSgenome.Hsapiens.UCSC.hg38')
BiocManager::install(packages)
library(GenomeInfoDb)
library(GenomicRanges)
library(GenomicAlignments)
library(ComplexHeatmap)
library(circlize)
library(rtracklayer)
library(Gviz)
library(ggplot2)
library(BSgenome.Hsapiens.UCSC.hg38)
library(tidyr)
library(AnnotationHub)
library(dplyr)
library(GenomicFeatures)
library(normr)
library(TFBSTools)
library(MotifDb)
library(rGADEM)
Download Data:
data = system.file('extdata/chip-seq',package='compGenomRData')
chipFiles = list.files(data, full.names=TRUE)
chipFiles
[1] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/CTCF_peaks.txt"
[2] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_CTCF_r1.chr21.bam"
[3] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_CTCF_r1.chr21.bam.bai"
[4] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_CTCF_r2.chr21.bam"
[5] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_CTCF_r2.chr21.bam.bai"
[6] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K27me3.chr21.bam"
[7] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K27me3.chr21.bam.bai"
[8] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K27me3.chr21.bw"
[9] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K36me3.chr21.bam"
[10] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K36me3.chr21.bam.bai"
[11] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K36me3.chr21.bw"
[12] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K4me1.chr21.bam"
[13] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K4me1.chr21.bam.bai"
[14] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K4me1.chr21.bw"
[15] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K4me3.chr21.bam"
[16] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K4me3.chr21.bam.bai"
[17] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K4me3.chr21.bw"
[18] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r1.chr21.bam"
[19] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r1.chr21.bam.bai"
[20] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r2.chr21.bam"
[21] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r2.chr21.bam.bai"
[22] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r3.chr21.bam"
[23] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r3.chr21.bam.bai"
[24] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r4.chr21.bam"
[25] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r4.chr21.bam.bai"
[26] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r5.chr21.bam"
[27] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r5.chr21.bam.bai"
[28] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_POLR2A.chr21.bw"
[29] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_SMC3_r1.chr21.bam"
[30] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_SMC3_r1.chr21.bam.bai"
[31] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_SMC3_r2.chr21.bam"
[32] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_SMC3_r2.chr21.bam.bai"
[33] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_ZNF143_r1.chr21.bam"
[34] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_ZNF143_r1.chr21.bam.bai"
[35] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_ZNF143_r2.chr21.bam"
[36] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_ZNF143_r2.chr21.bam.bai"
Original Data from: http://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-11431
The data set consist of the following ChIP experiments: 1. Transcription factors: CTCF, SMC3, ZNF143, PolII (RNA polymerase 2) 2. Histone modifications: H3k4me3, H3k36me3, H3k27ac, H3k27me3 3. Various input samples
In ChIP quality control, the aim is to determine whether the chromatin immunoprecipitation enrichment was successful. This step plays an important role in the ChIP-seq analysis, as it helps us identify low-quality ChIP samples and pinpoint any steps that went wrong.
Steps in the ChIP quality control: 1. Sample correlation clustering - clustering of the pair-wise correlations between genome-wide signal profiles. 2. Data visualization in a genomic browser. 3. Average fragment length determination - determining whether the ChIP enriched for fragments of a certain length. 4. Visualization of GC bias. Plot the ChIP enrichment Vs the average GC content in the corresponding genomic bin.
Use the GenomeInfoDb to fetch the chromosome lengths corresponding to the hg38 version of the human genome, and filter the length for human chromosome 21.
hg_chrs = getChromInfoFromUCSC('hg38')
tilling_window = tileGenome(seqlengths, tilewidth=1000)
tilling_window
GRangesList object of length 46710:
[[1]]
GRanges object with 1 range and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr21 1-1000 *
-------
seqinfo: 1 sequence from an unspecified genome
[[2]]
GRanges object with 1 range and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr21 1001-2000 *
-------
seqinfo: 1 sequence from an unspecified genome
[[3]]
GRanges object with 1 range and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr21 2001-3000 *
-------
seqinfo: 1 sequence from an unspecified genome
...
<46707 more elements>
#fetch bam files from the data folder
bam_files = list.files(data, full.names=TRUE, pattern='bam$')
bam_files
[1] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_CTCF_r1.chr21.bam"
[2] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_CTCF_r2.chr21.bam"
[3] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K27me3.chr21.bam"
[4] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K36me3.chr21.bam"
[5] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K4me1.chr21.bam"
[6] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_H3K4me3.chr21.bam"
[7] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r1.chr21.bam"
[8] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r2.chr21.bam"
[9] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r3.chr21.bam"
[10] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r4.chr21.bam"
[11] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_Input_r5.chr21.bam"
[12] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_SMC3_r1.chr21.bam"
[13] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_SMC3_r2.chr21.bam"
[14] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_ZNF143_r1.chr21.bam"
[15] "C:/Users/manso/Documents/R/win-library/4.1/compGenomRData/extdata/chip-seq/GM12878_hg38_ZNF143_r2.chr21.bam"
tileGenome() from the GenomicRanges package constructs equally sized windows over the genome of interest. The function takes two arguments.
#summarizeOverlaps() to count the reads
so = summarizeOverlaps(tilling_window, bam_files)
so
class: RangedSummarizedExperiment
dim: 46710 15
metadata(0):
assays(1): counts
rownames: NULL
rowData names(0):
colnames(15): GM12878_hg38_CTCF_r1.chr21.bam
GM12878_hg38_CTCF_r2.chr21.bam ...
GM12878_hg38_ZNF143_r1.chr21.bam
GM12878_hg38_ZNF143_r2.chr21.bam
colData names(0):
# extract the counts from the SummarizedExperiment
counts = assays(so)[[1]]
For normalization procedure use cpm - counts per million. CPM=counts∗(106/totalnumberofreads)
cpm = cpm[rowSums(cpm) > 0,]
cor() and visualize with heatmap():
# calculates the pearson correlation coefficient between the samples
correlation_matrix = cor(cpm, method='pearson')
correlation_matrix
CTCF_r1 CTCF_r2 H3K27me3 H3K36me3
CTCF_r1 1.000000000 0.9406309080 0.0085609092 0.019875255
CTCF_r2 0.940630908 1.0000000000 0.0004698221 0.007835995
H3K27me3 0.008560909 0.0004698221 1.0000000000 0.479775621
H3K36me3 0.019875255 0.0078359946 0.4797756208 1.000000000
H3K4me1 0.161853186 0.1495350984 0.2948626416 0.449868590
H3K4me3 0.132773304 0.1307846337 0.4194343327 0.533215967
Input_r1 0.178574074 0.1974113131 -0.0226028409 -0.057379963
Input_r2 0.143500103 0.1315265070 0.0677646793 0.029916323
Input_r3 0.150040297 0.1635217861 -0.0191988955 -0.091241267
Input_r4 0.312228069 0.2846157426 0.0285260446 0.114239859
Input_r5 0.315726198 0.3227039153 -0.0067138519 0.034186624
SMC3_r1 0.598434402 0.5809349265 0.0203170578 0.012933894
SMC3_r2 0.822832808 0.8446257580 -0.0059829391 0.008793527
ZNF143_r1 0.135453490 0.1256535034 0.0058484644 0.015613323
ZNF143_r2 0.169343388 0.1557787004 0.0169864635 0.030783694
H3K4me1 H3K4me3 Input_r1 Input_r2 Input_r3
CTCF_r1 0.1618532 0.13277330 0.17857407 0.14350010 0.15004030
CTCF_r2 0.1495351 0.13078463 0.19741131 0.13152651 0.16352179
H3K27me3 0.2948626 0.41943433 -0.02260284 0.06776468 -0.01919890
H3K36me3 0.4498686 0.53321597 -0.05737996 0.02991632 -0.09124127
H3K4me1 1.0000000 0.59449293 0.18023965 0.11377345 0.11214333
H3K4me3 0.5944929 1.00000000 0.14112824 0.09128978 0.07230666
Input_r1 0.1802397 0.14112824 1.00000000 0.39330880 0.57495776
Input_r2 0.1137734 0.09128978 0.39330880 1.00000000 0.50997410
Input_r3 0.1121433 0.07230666 0.57495776 0.50997410 1.00000000
Input_r4 0.4023336 0.24137345 0.41809109 0.50604357 0.45835914
Input_r5 0.4257479 0.33207614 0.50476248 0.43657942 0.51496386
SMC3_r1 0.2488308 0.15850473 0.46051385 0.45576268 0.52400343
SMC3_r2 0.2916111 0.19782108 0.36272018 0.25167064 0.34192597
ZNF143_r1 0.0959422 0.19481348 0.11090068 0.11209700 0.10391235
ZNF143_r2 0.1313165 0.21805623 0.14347350 0.17880049 0.14239340
Input_r4 Input_r5 SMC3_r1 SMC3_r2 ZNF143_r1
CTCF_r1 0.31222807 0.315726198 0.59843440 0.822832808 0.135453490
CTCF_r2 0.28461574 0.322703915 0.58093493 0.844625758 0.125653503
H3K27me3 0.02852604 -0.006713852 0.02031706 -0.005982939 0.005848464
H3K36me3 0.11423986 0.034186624 0.01293389 0.008793527 0.015613323
H3K4me1 0.40233356 0.425747949 0.24883078 0.291611132 0.095942197
H3K4me3 0.24137345 0.332076142 0.15850473 0.197821075 0.194813481
Input_r1 0.41809109 0.504762481 0.46051385 0.362720185 0.110900676
Input_r2 0.50604357 0.436579420 0.45576268 0.251670635 0.112097001
Input_r3 0.45835914 0.514963860 0.52400343 0.341925967 0.103912346
Input_r4 1.00000000 0.640429497 0.62646135 0.452995126 0.192634849
Input_r5 0.64042950 1.000000000 0.56636214 0.523568436 0.266257749
SMC3_r1 0.62646135 0.566362138 1.00000000 0.717324530 0.189151853
SMC3_r2 0.45299513 0.523568436 0.71732453 1.000000000 0.155443794
ZNF143_r1 0.19263485 0.266257749 0.18915185 0.155443794 1.000000000
ZNF143_r2 0.26132924 0.323477724 0.24532198 0.197975614 0.978461747
ZNF143_r2
CTCF_r1 0.16934339
CTCF_r2 0.15577870
H3K27me3 0.01698646
H3K36me3 0.03078369
H3K4me1 0.13131652
H3K4me3 0.21805623
Input_r1 0.14347350
Input_r2 0.17880049
Input_r3 0.14239340
Input_r4 0.26132924
Input_r5 0.32347772
SMC3_r1 0.24532198
SMC3_r2 0.19797561
ZNF143_r1 0.97846175
ZNF143_r2 1.00000000
#Define the color palette which will be used in the heatmap; we are using library(circlize)
heatmap_col = circlize::colorRamp2(c(-1,0,1),c('blue','white','red'))
#plot the heatmap using the Heatmap() function
Heatmap(correlation_matrix, col = heatmap_col)
The CTCF protein is a zinc finger protein that forms a complex with the Cohesin protein. It should be expected that the SMC3 signal profile correlates highly with the CTCF signal profile, since SMC3 is a subunit of the Cohesin complex. It is true for the second biological replicate of SMC3, while the first one (SMC3_r1) clusters with the input samples, which suggests that the sample likely has low enrichment.
Furthermore, we can observe that ChIP and input samples form separate clusters, implying that the ChIP samples have an enrichment of fragments. Not only the ChIP samples but also biological replicates are grouped together.
In any ChIP-seq analysis, reviewing the data should be one of the first steps. Signal profiles can be plotted around regions of interest, or data can be loaded into a genome browser (such as IGV, or UCSC genome browsers).
Let’s start by importing the .bam file into R. From there we will calculate the signal profile (build the coverage vector), which will be exported as .bigWig.
chip_file = bam_files[1]
the reads into R:
reads = readGAlignments(chip_file)
reads = granges(reads)
reads
GRanges object with 103226 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr21 5038229-5038256 +
[2] chr21 5101975-5102002 +
[3] chr21 5101995-5102022 +
[4] chr21 5104372-5104399 +
[5] chr21 5138804-5138831 +
... ... ... ...
[103222] chr21 46675918-46675945 +
[103223] chr21 46676161-46676188 -
[103224] chr21 46679138-46679165 +
[103225] chr21 46680525-46680552 +
[103226] chr21 46681928-46681955 +
-------
seqinfo: 195 sequences from an unspecified genome
reads = resize(reads, width=200, fix='start')
reads = keepSeqlevels(reads, 'chr21', pruning.mode='coarse') # keeps only chromosome 21
cov = coverage(reads, width = seqlengths) # convert the reads into a signal profile
output_file = sub('.bam','.bigWig', chip_file)
export.bw(cov, 'output_file')
Gviz provides comprehensive, customized visualization of genomic experiments. First we define tracks that represent genomic annotation or signal profiles, which are then ordered and plotted.
2 tracks: one showing the position along chromosome 21 and another showing the signal from the CTCF experiment.
axis = GenomeAxisTrack(GRanges('chr21', IRanges(1, width=seqlengths)))
track_list = list(axis,dtrack)
track_list
[[1]]
Genome axis 'Axis'
There are annotated axis regions:
GRanges object with 1 range and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr21 1-46709983 *
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
[[2]]
DataTrack 'CTCF'
| genome: NA
| active chromosome: chr21
| positions: 199419
| samples:1
| strand: *
Tracks are plotted with plotTracks(). sizes argument needs to be the same size as the track_list, and defines the relative size of each track.
plotTracks(track_list, sizes=c(.1,1), background.title = "black")
# sizes argument defines the relative sizes of tracks
# background title defines the color for the track labels
The cross-correlation of plus and minus strands can be used to determine whether a DNA library was enriched for fragments of a particular length.
Load the reads:
reads = readGAlignments(chip_file)
reads = granges(reads)
# keep only the starting position of each read
reads = resize(reads, width=1, fix='start')
reads = keepSeqlevels(reads, 'chr21', pruning.mode='coarse')
# calculate the coverage profile for plus and minus strand
reads = split(reads, strand(reads))
# coverage(x, width = seqlengths)[[1]] > 0 calculates the coverage and converts the coverage vector into a boolean
cov = lapply(reads, function(x)coverage(x, width = seqlengths)[[1]] > 0)
cov = lapply(cov, as.vector)
# defines the shift range
wsize = 1:400
# defines the jaccard similarity
jaccard = function(x,y)sum((x & y)) / sum((x | y))
# shifts the + vector by 1 - 400 nucleotides and calculates the correlation coefficient
cc = shiftApply(wsize, cov[['+']], cov[['-']], FUN=jaccard)
# converts the results into a data frame
cc = data.frame(fragment_size = wsize, cross_correlation = cc)
ggplot(data = cc, aes(fragment_size, cross_correlation)) +
geom_point() +
geom_vline(xintercept = which.max(cc$cross_correlation),
size=2, color='red', linetype=2) +
theme_bw() +
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5)) +
xlab('Shift in base pairs') +
ylab('Jaccard similarity')
hg_chrs = getChromInfoFromUCSC('hg38')
hg_chrs = subset(hg_chrs, grepl('chr21$',chrom))
seqlengths = with(hg_chrs, setNames(size, chrom))
tilling_window = unlist(tileGenome(seqlengths, tilewidth=1000))
seq = getSeq(BSgenome.Hsapiens.UCSC.hg38, tilling_window)
# calculate the frequency of all possible dimers in our sequence set
nuc = oligonucleotideFrequency(seq, width = 2)
# convert the matrix into a data.frame
nuc = as.data.frame(nuc)
# calculate the percentages, and rounds the number
nuc = round(nuc/1000,3)
gc = cbind(data.frame(cpm_log), GC = nuc['GC'])
ggplot(data = gc, aes(GC, GM12878_hg38_CTCF_r1.chr21.bam)) +
geom_point(size=2, alpha=.3) +
theme_bw() +
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5)) +
xlab('GC content in one kilobase windows') +
ylab('log10( cpm + 1 )') +
ggtitle('CTCF Replicate 1')
FIGURE: GC content abundance in a ChIP-seq experiment
Comparison of plots corresponding to multiple experiments.
#Gather() converts a fat data.frame into a tall data.frame, which is the format used by the ggplot package
gcd = gather(data = gc, experiment, cpm, -GC)
#Select the ChIP files corresponding to the ctcf experiment
gcd = subset(gcd, grepl('CTCF', experiment))
#Remove the chr21 suffix
gcd$experiment = sub('chr21.','',gcd$experiment)
ggplot(data = gcd, aes(GC, log10(cpm+1))) +
geom_point(size=2, alpha=.05) +
theme_bw() +
facet_wrap(~experiment, nrow=1)+
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5)) +
xlab('GC content in one kilobase windows') +
ylab('log10( cpm + 1 )') +
ggtitle('CTCF Replicates 1 and 2')
# connect to the hub object
hub = AnnotationHub()
# query the hub for the human annotation
AnnotationHub::query(hub, c('ENSEMBL','Homo','GRCh38','chr','gtf'))
AnnotationHub with 42 records
# snapshotDate(): 2021-10-20
# $dataprovider: Ensembl
# $species: Homo sapiens, homo sapiens
# $rdataclass: GRanges
# additional mcols(): taxonomyid, genome, description,
# coordinate_1_based, maintainer, rdatadateadded,
# preparerclass, tags, rdatapath, sourceurl, sourcetype
# retrieve records with, e.g., 'object[["AH50842"]]'
title
AH50842 | Homo_sapiens.GRCh38.84.chr.gtf
AH50843 | Homo_sapiens.GRCh38.84.chr_patch_hapl_scaff.gtf
AH51012 | Homo_sapiens.GRCh38.85.chr.gtf
AH51013 | Homo_sapiens.GRCh38.85.chr_patch_hapl_scaff.gtf
AH51953 | Homo_sapiens.GRCh38.86.chr.gtf
... ...
AH89861 | Homo_sapiens.GRCh38.103.chr_patch_hapl_scaff.gtf
AH92107 | Homo_sapiens.GRCh38.104.chr.gtf
AH92108 | Homo_sapiens.GRCh38.104.chr_patch_hapl_scaff.gtf
AH98493 | Homo_sapiens.GRCh38.105.chr.gtf
AH98494 | Homo_sapiens.GRCh38.105.chr_patch_hapl_scaff.gtf
gtf = gtf[seqnames(gtf) == 'chr21']
# construct a GRangesList with human annotation
annotation_list = GRangesList(
tss = promoters(subset(gtf, type=='gene'),
upstream=1000, downstream=1000),
exon = subset(gtf, type=='exon'),
intron = subset(gtf, type=='gene')
)
# promoters() extends the gtf around the TSS by an upstream and downstream amounts
annotateReads = function(bam_file, annotation_list){
message(basename(bam_file))
# load the reads into R
bam = readGAlignments(bam_file)
# find overlaps between reads and annotation
result = as.data.frame(suppressWarnings(findOverlaps(bam, annotation_list)))
result$annotation = names(annotation_list)[result$subjectHits]
# order the overlaps based on the hierarchy
result = result[order(result$subjectHits),]
# select only one category per read
result = subset(result, !duplicated(queryHits))
# count the number of reads in each category
# group the result data frame by the corresponding category
result = group_by(.data=result, annotation)
# count the number of reads in each category
result = summarise(.data = result, counts = length(annotation))
# classify all reads which are outside of the annotation as intergenic
result = rbind(result, data.frame(
annotation = 'intergenic',
counts = length(bam) - sum(result$counts)))
# calculate the frequency
result$frequency = with(result, round(counts/sum(counts),2))
12
# append the experiment name
result$experiment = basename(bam_file)
return(result)
}
# list all bam files in the folder
bam_files = list.files(data, full.names=TRUE, pattern='bam$')
# calculate the read distribution for every file
annot_reads_list = lapply(bam_files, function(x)annotateReads(x, annotation_list))
# Collapse the per-file read distributions into one data.frame
annot_reads_df = dplyr::bind_rows(annot_reads_list)
# Format the experiment names
annot_reads_df$experiment = sub('.chr21.bam','',annot_reads_df$experiment)
annot_reads_df$experiment = sub('GM12878_hg38_','',annot_reads_df$experiment)
ggplot(data = annot_reads_df, aes(experiment, frequency, fill=annotation)) +
geom_bar(stat='identity') +
theme_bw() +
scale_fill_brewer(palette='Set2') +
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab('Sample') +
ylab('Percentage of reads') +
ggtitle('Percentage of reads in annotation')
hub = AnnotationHub()
gtf = hub[['AH61126']] #retrieve human gene annotation in the format .gtf into a GRanges object
#paste chr21 prefix to chromosome names
seqlevels(gtf, pruning.mode='coarse') = '21'
seqlevels(gtf, pruning.mode='coarse') = paste0('chr', seqlevels(gtf))
# convert the gtf annotation into a data.base
txdb = makeTxDbFromGRanges(gtf)
#Convert the transcript database into a Gviz track
gene_track = GeneRegionTrack(txdb, chr='chr21', genome='hg38')
Once we have downloaded the annotation, and imported the signal profiles into R we are ready to visualize the data. We will again use the Gviz library.
The ideogram track that will show the position of our current viewpoint on the chromosome, and a genome axis track, which will show the exact coordinates.
# fetches the chromosome length information
hg_chrs = getChromInfoFromUCSC('hg38')
hg_chrs = subset(hg_chrs, (grepl('chr21$',chrom)))
# convert data.frame to named vector
seqlengths = with(hg_chrs, setNames(size, chrom))
# constructs the ideogram track
chr_track = IdeogramTrack('chr21', 'hg38')
# constructs the coordinate system
axis = GenomeAxisTrack(GRanges('chr21', IRanges(1, width=seqlengths)))
# lapply() on the imported bw files to create the track objects
# we loop over experiment names, and select the corresponding object
# within the function chip_profiles[[exp_name]] - selects the proper experiment using the name
data_tracks = lapply(names(chip_profiles), function(exp_name){
DataTrack(chip_profiles[[exp_name]], name = exp_name, type='h', lwd=5)
})
# select the start coordinate for the URB1 gene
start = min(start(subset(gtf, gene_name == 'URB1')))
# select the end coordinate for the URB1 gene
end = max(end(subset(gtf, gene_name == 'URB1')))
# plot the signal profiles around the URB1 gene
plotTracks(c(chr_track, axis, gene_track, data_tracks),
sizes=c(1,1,1,1,1,1), background.title = "black",
collapseTranscripts = "longest",
transcriptAnnotation="symbol",
from = start - 5000,
to = end + 5000)
Use normR for peak calling in sharp and broad peak experiments. NB: normR does not support the usage of biological replicates
hg_chrs = getChromInfoFromUCSC('hg38')
hg_chrs = subset(hg_chrs, grepl('chr21$',chrom))
seqlengths = with(hg_chrs, setNames(size, chrom))
tilling_window = unlist(tileGenome(seqlengths, tilewidth=1000))
counts = summarizeOverlaps(tilling_window, c(chip_file, control_file))
counts = assays(counts)[[1]]
cpm = t(t(counts)*(1000000/colSums(counts)))
# convert the matrix into a data.frame for ggplot
cpm = data.frame(cpm)
ggplot(data = cpm, aes(GM12878_hg38_Input_r5.chr21.bam, GM12878_hg38_CTCF_r1.chr21.bam)) +
geom_point() +
geom_abline(slope = 1) +
theme_bw() +
theme_bw() +
scale_fill_brewer(palette='Set2') +
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab('Input CPM') +
ylab('CTCF CPM') +
ggtitle('ChIP Vs Input')
Regions above the diagonal show higher enrichment in the ChIP samples, while the regions below the diagonal show higher enrichment in the Input samples.
Perform for peak calling. normR usage is deceivingly simple – to achieve this, provide the location ChIP and Control read files, and the genome version to the enrichR function. The function will automatically create tilling windows (250bp by default), count the number of reads in each window, and fit a mixture of binomial distributions.
# peak calling using chip and control
ctcf_fit = enrichR(
# ChIP file
treatment = chip_file,
# control file
control = control_file,
# genome version
genome = "hg38",
# print intermediary steps during the analysis
verbose = FALSE)
summary(ctcf_fit)
NormRFit-class object
Type: 'enrichR'
Number of Regions: 12353090
Number of Components: 2
Theta* (naive bg): 0.137
Background component B: 1
+++ Results of fit +++
Mixture Proportions:
Background Class 1
97.72% 2.28%
Theta:
Background Class 1
0.103 0.695
Bayesian Information Criterion: 539882
+++ Results of binomial test +++
T-Filter threshold: 4
Number of Regions filtered out: 12267164
Significantly different from background B based on q-values:
TOTAL:
*** ** * . n.s.
Bins 0 627 120 195 87 84897
% 0.000 0.711 0.847 1.068 1.166 96.209
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 'n.s.'
The summary function shows that most of the regions of the chromosome 21 correspond to the background - 97.72%. In total we have 1029 (627+120+195+87) significantly enriched regions.
getRanges() extracts the regions from the model.
getQvalue(), and getEnrichment() assign statistical significance to regions of interest and calculated enrichment. In order to identify only highly significant regions, keep only ranges where the false discovery rate (q value) is below 0.01.
# extracts the ranges
ctcf_peaks = getRanges(ctcf_fit)
# annotates the ranges with the supporting p value
ctcf_peaks$qvalue = getQvalues(ctcf_fit)
# annotates the ranges with the calculated enrichment
ctcf_peaks$enrichment = getEnrichment(ctcf_fit)
# selects the ranges which correspond to the enriched class
ctcf_peaks = subset(ctcf_peaks, !is.na(component))
# Filter by a stringent q value threshold
ctcf_peaks = subset(ctcf_peaks, qvalue < 0.01)
# order the peaks based on the q value
ctcf_peaks = ctcf_peaks[order(ctcf_peaks$qvalue)]
After stringent q value filtering we are left with 724 peaks
seqlevels(ctcf_peaks, pruning.mode='coarse') = 'chr21'
analysis:
write.table(ctcf_peaks, file.path(data, 'CTCF_peaks.txt'),
row.names=F, col.names=T, quote=F, sep='\t')
# find enriched tilling windows
enriched_regions = countOverlaps(tilling_window, ctcf_peaks) > 0
cpm$enriched_regions = enriched_regions
ggplot(data = cpm, aes(GM12878_hg38_Input_r5.chr21.bam, GM12878_hg38_CTCF_r1.chr21.bam,
color=enriched_regions)) +
geom_point() +
geom_abline(slope = 1) +
theme_bw() +
scale_fill_brewer(palette='Set2') +
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab('Input CPM') +
ylab('CTCF CPM') +
ggtitle('ChIP Vs Input') +
scale_color_manual(values=c('gray','red'))
normR identified all of the regions above the diagonal as statistically significant. However, it labeled a significant number of regions below the diagonal, because normR has greater sensitivity and these peaks might really be enriched regions.
This will show the kind of signal properties that have contributed to the peak calling. Expect to see a strong, bell shaped, enrichment in the ChIP sample, and a uniform noise in the Input sample. The following function takes as input a .bam file, and loads the bam into R. It extends the reads to a size of 200 bp, and creates the coverage vector.
# calculate the coverage for one bam file
calculateCoverage = function(bam_file, extend = 200)
{
reads = readGAlignments(bam_file) #Load reads into R
reads = granges(reads) #Convert reads into a GRanges object
reads = resize(reads, width=extend, fix='start') #Resize the reads to 200bp
cov = coverage(reads) #Get the coverage vector
cov = round(cov * (1000000/length(reads)),2)#Normalize coverage vector to the sequencing depth
cov = as(cov, 'GRanges')#Convert the coverate go a GRanges object
seqlevels(cov, pruning.mode='coarse') = 'chr21' #keep only chromosome 21
return(cov)
}
# calculate coverage for the ChIP file
ctcf_cov = calculateCoverage(chip_file)
# calculate coverage for the control file
cont_cov = calculateCoverage(control_file)
then the peak locations and the signal files:
chr_track = IdeogramTrack('chr21', 'hg38')
axis = GenomeAxisTrack(GRanges('chr21', IRanges(1, width=seqlengths)))
peaks_track = AnnotationTrack(ctcf_peaks, name = "CTCF Peaks")
chip_track = DataTrack(ctcf_cov, name = "CTCF", type='h', lwd=3)
cont_track = DataTrack(cont_cov, name = "Input", type='h', lwd=3)
plotTracks(list(chr_track,
axis,
peaks_track,
chip_track,
cont_track),
sizes=c(.2,.5,.5,1,1), background.title = "black",
from = start(ctcf_peaks)[1] - 1000,
to = end(ctcf_peaks)[1] + 1000)
Although the Input sample shows an enrichment, it is important to compare the scales on both samples. The normalized ChIP signal goes up to 2500, while the maximum value in the input sample is only 60.
Using normR() to call peaks for the H3K36me3 histone modification, which is associated with gene bodies of expressed genes.
The H3K36 regions span broad domains, thus it is necessary to increase the tilling window size which will be used for counting. Using the countConfiguration () set the tilling window size to 5000 base pairs
#Define the window width for the counting
countConfiguration = countConfigSingleEnd(binsize = 5000)
#Find broad peaks using enrichR
h3k36_fit = enrichR(
treatment = chip_file,#ChIP file
control = control_file,#Control file
genome = "hg38", #Genome version
verbose = FALSE,
countConfig = countConfiguration)# window size for counting
summary(h3k36_fit)
NormRFit-class object
Type: 'enrichR'
Number of Regions: 617665
Number of Components: 2
Theta* (naive bg): 0.197
Background component B: 1
+++ Results of fit +++
Mixture Proportions:
Background Class 1
85.4% 14.6%
Theta:
Background Class 1
0.138 0.442
Bayesian Information Criterion: 741525
+++ Results of binomial test +++
T-Filter threshold: 5
Number of Regions filtered out: 610736
Significantly different from background B based on q-values:
TOTAL:
*** ** * . n.s.
Bins 0 1005 314 381 237 4992
% 0.00 9.18 12.04 15.52 17.68 45.58
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 'n.s.'
The summary function shows that we get 1937 enriched regions.
# get the locations of broad peaks
h3k36_peaks = getRanges(h3k36_fit)
# extract the qvalue and enrichment
h3k36_peaks$qvalue = getQvalues(h3k36_fit)
h3k36_peaks$enrichment = getEnrichment(h3k36_fit)
# select proper peaks
h3k36_peaks = subset(h3k36_peaks, !is.na(component))
h3k36_peaks = subset(h3k36_peaks, qvalue < 0.01)
h3k36_peaks = h3k36_peaks[order(h3k36_peaks$qvalue)]
# collapse nearby enriched reginos
h3k36_peaks = reduce(h3k36_peaks)
# construct the data tracks for the H3k36me3 and Input files
h3k36_cov = calculateCoverage(chip_file)
data_tracks = list(
h3k36 = DataTrack(h3k36_cov, name = 'h3k36_cov', type='h', lwd=3),
input = DataTrack(cont_cov, name = 'Input', type='h', lwd=3))
# define the window for the visualization
start = min(start(h3k36_peaks[2])) - 25000
end = max(end(h3k36_peaks[2])) + 25000
# create the peak track
peak_track = AnnotationTrack(reduce(h3k36_peaks), name='h3k36me3')
# plots the enriched region
plotTracks(c(chr_track, axis, gene_track, peak_track, data_tracks),
sizes=c(.5,.5,.5,.1,1,1), background.title = "black",
collapseTranscripts = "longest", transcriptAnnotation="symbol",
from = start,
to = end)
The figure shows a highly enriched H3K36me3 region covering the gene body, as expected.
To calculate the reads in peaks, extract the number of reads in each tilling window from the normR produced fit object. This is done using the getCounts function. Then use the q value to define which tilling windows correspond to peaks, and count the number of reads within and outside peaks.
#Extract, per tilling window, counts from the fit object
h3k36_counts = data.frame(getCounts(h3k36_fit))
#Change the column names of the data.frame
colnames(h3k36_counts) = c('Input','H3K36me3')
#Extract the q value corresponding to each bin
h3k36_counts$qvalue = getQvalues(h3k36_fit)
#Define which regions are peaks using a q value cutoff
h3k36_counts$enriched[is.na(h3k36_counts$qvalue)] = 'Not Peak'
h3k36_counts$enriched[h3k36_counts$qvalue > 0.05] = 'Not Peak'
h3k36_counts$enriched[h3k36_counts$qvalue <= 0.05] = 'Peak'
#Remove the q value column
h3k36_counts$qvalue = NULL
#Reshape the data.frame into a long format
h3k36_counts_df = tidyr::gather(data=h3k36_counts, experiment, counts, -enriched)
#Sum the number of reads in the Peak and Not Peak regions
h3k36_counts_df = group_by(.data = h3k36_counts_df, experiment, enriched)
h3k36_counts_df = summarize(.data = h3k36_counts_df, num_of_reads = sum(counts))
#Calculate the percentage of reads.
h3k36_counts_df = group_by(.data = h3k36_counts_df, experiment)
h3k36_counts_df = mutate(.data = h3k36_counts_df, total=sum(num_of_reads))
h3k36_counts_df$percentage = with(h3k36_counts_df, round(num_of_reads/total,2))
ggplot(data = h3k36_counts_df, aes(experiment, percentage, fill=enriched)) +
geom_bar(stat='identity', position='dodge') +
theme_bw() +
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=12,face="bold"),
plot.title = element_text(hjust = 0.5)) +
xlab('Experiment') +
ylab('Percetage of reads in region') +
ggtitle('Percentage of reads in peaks for H3K36me3') +
scale_fill_manual(values=c('gray','red'))
The figure shows that the ChIP sample is clearly enriched in the peak regions. The percentage of read in peaks will depend on the quality of the antibody (strength of enrichment), and the size of peaks which are bound by the protein of interest. If the total size of peaks is small, relative to the genome size, we can expect that the percentage of reads in peaks will be small. For in depth consideration about the appropriate percentage thresholds see (ref)
#Fetch the CTCF motif from the data base
motifs = query(query(MotifDb, 'Hsapiens'), 'CTCF')
#Show all available ctcf motifs
motifs
MotifDb object of length 12
| Created from downloaded public sources: 2013-Aug-30
| 12 position frequency matrices from 8 sources:
| HOCOMOCOv10: 2
| HOCOMOCOv11-core-A: 2
| JASPAR_2014: 1
| JASPAR_CORE: 1
| jaspar2016: 1
| jaspar2018: 2
| jolma2013: 1
| SwissRegulon: 2
| 1 organism/s
| Hsapiens: 12
Hsapiens-HOCOMOCOv10-CTCFL_HUMAN.H10MO.A
Hsapiens-HOCOMOCOv10-CTCF_HUMAN.H10MO.A
Hsapiens-HOCOMOCOv11-core-A-CTCFL_HUMAN.H11MO.0.A
Hsapiens-HOCOMOCOv11-core-A-CTCF_HUMAN.H11MO.0.A
Hsapiens-JASPAR_CORE-CTCF-MA0139.1
...
Hsapiens-jaspar2018-CTCF-MA0139.1
Hsapiens-jaspar2018-CTCFL-MA1102.1
Hsapiens-jolma2013-CTCF
Hsapiens-SwissRegulon-CTCFL.SwissRegulon
Hsapiens-SwissRegulon-CTCF.SwissRegulon
ctcf_motif = motifs[[9]]
The downloaded motif is a matrix with dimensions 4 X N. The matrix contains 4 rows where each row corresponds to one nucleotide (A, C, G, T). The number of columns designates the width of the region bound by the transcription factor and each element of the matrix contains the probability of observing the corresponding nucleotide on this position. For example, for following the CTCF matrix, the probability of observing a thymine at the first position of the motif is 0.57 (1st column, 4th row).
Such matrix, where each column is a probability distribution of over a sequence of nucleotides is called a position frequency matrix - PFM. These matrices are constructed from sequences which were experimentally verified sequences, using a motif finding algorithm.
motif:
# plot the seqLogo of the CTCF motif
seqLogo::seqLogo(ctcf_motif)
The figure above shows the CTCF sequence motif visualized as a sequence logo. Y axis ranges from zero to two, and corresponds to the amount of information each base in the motif contributes to the overall motif. The larger the letter the greater the probability of observing just one defined base on the designated position.
ctcf_peaks_resized = resize(ctcf_peaks, width = 400, fix='center')
Extend the peak regions to +/- 200 bp around the peak center. Because the average fragment size is 200 bp, 400 nucleotides is the expected variation in the position of the true binding location.
peak regions:
# extract the sequences around the peaks
seq = getSeq(BSgenome.Hsapiens.UCSC.hg38, ctcf_peaks_resized)
Using the TFBSTools package. Convert the raw probability matrix into a PWMMatrix object, then use for efficient scanning.
# Convert the matrix into a PWM object and scan the peaks
ctcf_pwm = PWMatrix(ID='CTCF', profileMatrix = ctcf_motif)
hits = searchSeq(ctcf_pwm, seq, min.score="80%", strand="*")
hits = as.data.frame(hits)
searchSeq() scans each sequence for the motif occurrence.
# label which peaks contain CTCF motifs
motif_hits_df = data.frame(
peak_order = 1:length(ctcf_peaks)
)
motif_hits_df$contains_motif = motif_hits_df$peak_order %in% hits$seqnames
motif_hits_df = motif_hits_df[order(-motif_hits_df$peak_order),]
# calculate the percentage of peaks with motif for peaks of descending strength
motif_hits_df$perc_peaks = with(motif_hits_df,
cumsum(contains_motif) / max(peak_order))
motif_hits_df$perc_peaks = round(motif_hits_df$perc_peaks, 2)
#Plot the cumulative distribution of motif hit percentages
ggplot(
motif_hits_df,
aes(
x = peak_order,
y = perc_peaks
)) +
geom_line(size=2) +
theme_bw() +
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5)) +
xlab('Peak rank') +
ylab('Percetage of peaks with motif') +
ggtitle('Percentage of CTCF peaks with the CTCF motif')
If the ChIP experiment was performed properly, we would expect the motif to be localized just below the summit of each peak. By plotting the motif localization around ChIP peaks, we are quantifying the uncertainty in peak location.
# resize the region around peaks to +/- 1kb
ctcf_peaks_resized = resize(ctcf_peaks, width = 2000, fix='center')
# fetch the sequence
seq = getSeq(BSgenome.Hsapiens.UCSC.hg38,ctcf_peaks_resized)
# convert the motiv matrix to PWM, and scan the peaks
ctcf_pwm = PWMatrix(ID='CTCF', profileMatrix = ctcf_motif)
hits = searchSeq(ctcf_pwm, seq, min.score="80%", strand="*")
hits = as.data.frame(hits)
# set the position relative to the start
hits$position = hits$start - 1000
# plot the motif hits around peaks
ggplot(data=hits, aes(position)) +
geom_density(size=2) +
theme_bw() +
geom_vline(xintercept = 0, linetype=2, color='red', size=2) +
xlab('Position around the CTCF peaks') +
ylab('Per position percentage\nof motif occurence') +
theme(
axis.text = element_text(size=10, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5))
The final step of quality control is to visualize the distribution of peaks in different functional genomic regions. The purpose of the analysis is to check whether the location of the peaks conforms with our prior knowledge. This analysis is equivalent to constructing distributions for reads.
# Download the annotation
hub = AnnotationHub()
gtf = hub[['AH61126']]
seqlevels(gtf, pruning.mode='coarse') = '21'
seqlevels(gtf, pruning.mode='coarse') = paste0('chr', seqlevels(gtf))
# Create the annotation hierarchy
annotation_list = GRangesList(
tss = promoters(subset(gtf, type=='gene'), 1000, 1000),
exon = subset(gtf, type=='exon'),
intron = subset(gtf, type=='gene')
)
#Annotate the location of each peak
annotatePeaks = function(peaks, annotation_list, name){
#Collapse touching enriched regions
peaks = reduce(peaks)
#Find overlaps between the peaks and annotation_list
result = as.data.frame(suppressWarnings(findOverlaps(peaks, annotation_list)))
#Fetch annotation names
result$annotation = names(annotation_list)[result$subjectHits]
#Rank by annotation precedence
result = result[order(result$subjectHits),]
#Remove overlapping annotations
result = subset(result, !duplicated(queryHits))
#Count the number of peaks in each annotation category
result = group_by(.data = result, annotation)
result = summarise(.data = result, counts = length(annotation))
#Fetch the number of intergenic peaks
result = rbind(result,
data.frame(annotation = 'intergenic',
counts = length(peaks) - sum(result$counts)))
result$frequency = with(result, round(counts/sum(counts),2))
result$experiment = name
return(result)
}
peak_list = list(CTCF = ctcf_peaks,
H3K36me3 = h3k36_peaks)
#Ccalculate the distribution of peaks in annotation for each experiment
annot_peaks_list = lapply(names(peak_list), function(peak_name){
annotatePeaks(peak_list[[peak_name]], annotation_list, peak_name)
})
#Combine a list of data.frames into one data.frame
annot_peaks_df = dplyr::bind_rows(annot_peaks_list)
#Plot the distrubtion of peaks in genomic features
ggplot(data = annot_peaks_df, aes(experiment, frequency, fill=annotation)) +
geom_bar(stat='identity') +
scale_fill_brewer(palette='Set2') +
theme_bw()+
theme(
axis.text = element_text(size=18, face='bold'),
axis.title = element_text(size=14,face="bold"),
plot.title = element_text(hjust = 0.5)) +
ggtitle('Peak distribution in\ngenomic regions') +
xlab('Experiment') +
ylab('Frequency')
The plot shows that the H3K36me3 peaks are located preferentially in gene bodies, as expected, while the CTCF peaks are found preferentially in introns.
#Read the CTCF peaks created in the peak calling part of the tutorial
ctcf_peaks = read.table(file.path(data, 'CTCF_peaks.txt'), header=TRUE)
#Convert the peaks into a GRanges object
ctcf_peaks = makeGRangesFromDataFrame(ctcf_peaks, keep.extra.columns = TRUE)
#Order the peaks by qvalue, and take top 500 peaks
ctcf_peaks = ctcf_peaks[order(ctcf_peaks$qvalue)]
ctcf_peaks = head(ctcf_peaks, n = 500)
#Merge nearby CTCF peaks
ctcf_peaks = reduce(ctcf_peaks)
This can be achieved using the sample function on a DNAString object. Before the sample function, set the seed for the random number generator.
# set the seed for the random number generator
set.seed(1)
# shuffle the ctcf sequences
back_seq = DNAStringSet(lapply(ctcf_seq, sample))
Ready to do motif discovery. This is executed using the findMotifFgBg(). The primary two arguments are the foreground and the background sequence data sets. We additionally define the maximal number of motifs, that we are interested only in motifs which are enriched in the foreground data sets, and that the motif width should be at least 15 base pairs. both.strand parameter designates that the motifs could be located both on the supplied sequences and their reverse complements.
novel_motifs <- GADEM(ctcf_seq, seed=1, nmotifs=3, genome=Hsapiens)
top 3 4, 5-mers: 20 40 60
top 3 4, 5-mers: 16 40 60
top 3 4, 5-mers: 14 40 60
Maximal number of motifs (3) reached
# visualize the resulting motif
plot(novel_motifs[1])
The motif shown in Figure above corresponds to the previously visualized CTCF motif. Nevertheless, we will computationally annotate our motif by querying the JASPAR database in the next section.
Compare our unknown motif with the JASPAR2018 database to figure out which transcription factor it corresponds to.
First, convert the frequency matrix into a PWMatrix object, and then use this object to query the database.
# load the JASPAR motif database
library(JASPAR2018)
# extract motifs corresponding to human transcription factors
pwm_library = getMatrixSet(
JASPAR2018,
opts=list(
collection = 'CORE',
species = 'Homo sapiens',
matrixtype = 'PWM'
))
# find the most similar motif to our motif
pwm_sim = PWMSimilarity(
# JASPAR library
pwm_library,
# out motif
unknown_pwm,
# measure for comparison
method = 'Pearson')
For each motif in the library we append the Pearson correlation with our unknown motif, and look at the topmost candidates.
# extract the motif names from the pwm library
pwm_library_list = lapply(pwm_library, function(x){
data.frame(ID = ID(x), name = name(x))
})
# combine the list into one data frame
pwm_library_dt = dplyr::bind_rows(pwm_library_list)
# fetch the similarity of each motif to our unknown motif
pwm_library_dt$similarity = pwm_sim[pwm_library_dt$ID]
# find the most similar motif in the library
pwm_library_dt = pwm_library_dt[order(-pwm_library_dt$similarity),]
head(pwm_library_dt)
ID name similarity
204 MA0738.1 HIC2 0.4210522
387 MA0147.3 MYC 0.4085302
95 MA0027.2 EN1 0.3847860
296 MA0820.1 FIGLA 0.3844179
5 MA0056.1 MZF1 0.3839454
130 MA0668.1 NEUROD2 0.3804663
As expected, the topmost candidate is CTCF.