1 Summary

In this case study, we perform differential peak calling on ChIP-seq data for the histone modification H3K4me3 in samples from two cell lines (K562 and GM12878), using publicly available data generated by the ENCODE Project. The same data set is used for all ChIP-seq differential testing case studies.

While several approaches have been proposed for differential peak calling, in this case study we examine a simple approach: testing for differential peaks in the promoter regions of all genes. Restricting the analysis to pre-defined regions means that differences in non-promoter regions can be missed; however, since H3K4me3 is a marker of active gene expression that is enriched at promoters, restricting tests to promoter regions is not unreasonable. Differential peaks are tested using DESeq2.

2 Workspace Setup

library(dplyr)
library(ggplot2)
library(SummarizedBenchmark)
library(BiocParallel)
library(GenomicRanges)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(DESeq2)
library(Rsubread)

## load helper functions: source every .r/.R script found in ../R
## (source() evaluates in the global environment by default, so the helpers
## end up in the workspace exactly as with an explicit loop)
helper_scripts <- list.files("../R", "\\.(r|R)$", full.names = TRUE)
invisible(lapply(helper_scripts, source))

## project data/results folders
##   datdir: downloaded BAM files and their indexes
##   resdir: intermediate results of this case study (counts, test results)
##   sbdir:  SummarizedBenchmark objects
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/ChIPseq"

## Create the folders if they are missing. `recursive = TRUE` matters for
## `sbdir`: it is a nested path, and a non-recursive dir.create() fails when
## the parent folder does not exist. Because warnings are suppressed, that
## failure would go unnoticed until the benchmark object is saved at the very
## end of the analysis.
dir.create(datdir, showWarnings = FALSE, recursive = TRUE)
dir.create(resdir, showWarnings = FALSE, recursive = TRUE)
dir.create(sbdir, showWarnings = FALSE, recursive = TRUE)

## intermediary files we create below (each step is skipped when its file
## already exists)
count_file <- file.path(resdir, "h3k4me3-promoters-counts.rds")
result_file <- file.path(resdir, "h3k4me3-promoters-results.rds")
bench_file <- file.path(sbdir, "h3k4me3-promoters-benchmark.rds")

## set up parallel backend
## The worker count is taken from SLURM_NTASKS. When that variable is unset
## or empty (e.g. when the script is not run through SLURM), Sys.getenv()
## yields "" and as.numeric("") is NA, which is not a usable worker count.
## Fall back to a single worker in that case instead of failing.
cores <- as.numeric(Sys.getenv("SLURM_NTASKS", unset = "1"))
if (is.na(cores) || cores < 1) {
    cores <- 1
}
multicoreParam <- MulticoreParam(workers = cores)

3 Data Preparation

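We download the BAM files (and their index files) directly from the UCSC ENCODE portal. Files that are already present in the data folder are not downloaded again.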

## Download `file_name` from `base_url` into `dest_dir` unless it already
## exists there. Returns the destination path invisibly.
##
## * `mode = "wb"` is required for binary files such as BAM/BAI; without it
##   download.file() may write in text mode on Windows and corrupt the file.
## * A failed download removes the partial file before signalling the error,
##   so that the file.exists() check does not mistake a truncated file for a
##   complete one on the next run.
fetch_if_missing <- function(base_url, file_name, dest_dir) {
    dest <- file.path(dest_dir, file_name)
    if (file.exists(dest)) {
        return(invisible(dest))
    }
    status <- tryCatch(
        download.file(paste0(base_url, file_name), destfile = dest,
                      mode = "wb"),
        error = function(e) {
            unlink(dest)
            stop(e)
        })
    ## download.file() reports some failures via a non-zero status only
    if (status != 0) {
        unlink(dest)
        stop("download failed for ", file_name, call. = FALSE)
    }
    invisible(dest)
}

## Download every BAM file in `bams` together with its ".bai" index.
fetch_bams <- function(base_url, bams, dest_dir) {
    for (bam in bams) {
        fetch_if_missing(base_url, bam, dest_dir)
        fetch_if_missing(base_url, paste0(bam, ".bai"), dest_dir)
    }
    invisible(NULL)
}

## Broad Institute H3K4me3 samples (two replicates per cell line)
broad_url <- "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHistone/"
broad_bams <- c("wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1.bam",
                "wgEncodeBroadHistoneGm12878H3k04me3StdAlnRep2V2.bam",
                "wgEncodeBroadHistoneK562H3k4me3StdAlnRep1.bam",
                "wgEncodeBroadHistoneK562H3k4me3StdAlnRep2.bam")
fetch_bams(broad_url, broad_bams, datdir)

## University of Washington H3K4me3 samples (two replicates per cell line)
uw_url <- "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeUwHistone/"
uw_bams <- c("wgEncodeUwHistoneGm12878H3k4me3StdAlnRep1.bam",
             "wgEncodeUwHistoneGm12878H3k4me3StdAlnRep2.bam",
             "wgEncodeUwHistoneK562H3k4me3StdAlnRep1.bam",
             "wgEncodeUwHistoneK562H3k4me3StdAlnRep2.bam")
fetch_bams(uw_url, uw_bams, datdir)

We determine sample metadata from the file names.

## all samples, Broad first, then UW
bamfiles <- c(broad_bams, uw_bams)
bam_names <- basename(bamfiles)

## file names follow wgEncode<Lab>Histone<Cell>H3k...; capture the lab and
## cell line parts
labs <- gsub("wgEncode(.*)Histone.*", "\\1", bam_names)
cells <- gsub("wgEncode.*Histone(.*)H3k.*", "\\1", bam_names)

## one row per BAM file
meta <- data.frame(cellline = cells,
                   lab = labs,
                   file = bamfiles)

Next, we identify promoter regions using the UCSC “Known Gene” annotations for human genome assembly hg19 (GRCh37).

## promoter region of every gene in the UCSC knownGene annotation, using the
## default window of promoters()
known_genes <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)

## keep only chr1-22, X, Y and M; with pruning.mode = "coarse" ranges on any
## other sequence are removed together with the sequence level
std_chroms <- paste0("chr", c(1:22, "X", "Y", "M"))
prom <- keepSeqlevels(promoters(known_genes), std_chroms,
                      pruning.mode = "coarse")

3.1 Read Counting

We count reads for all promoter regions.

## Count reads per promoter, or reuse the cached count matrix when present.
if (!file.exists(count_file)) {
    ## annotation table handed to featureCounts via `annot.ext`; promoters
    ## are identified by their index in `prom`
    anno <- data.frame(GeneID = seq_len(length(prom)),
                       Chr = seqnames(prom),
                       Start = start(prom),
                       End = end(prom),
                       Strand = strand(prom))

    ## only the count matrix of the featureCounts result is kept
    rc <- featureCounts(
        files = file.path(datdir, bamfiles),
        annot.ext = anno,
        allowMultiOverlap = TRUE,
        minOverlap = 50,
        readExtension3 = 150,
        ignoreDup = TRUE
    )$counts

    ## cache for later runs
    saveRDS(rc, file = count_file)
} else {
    rc <- readRDS(count_file)
}

4 Data Analysis

4.1 Differential Testing

We test for differential binding at each of the promoters.

## Run DESeq2 on the promoter counts, or reuse the cached results table.
if (!file.exists(result_file)) {
    ## model: lab as blocking factor, cell line as the term listed last
    dds <- DESeqDataSetFromMatrix(
        countData = rc,
        colData = as.data.frame(meta),
        design = ~ lab + cellline
    )
    dds <- DESeq(dds, fitType = "mean")

    ## independent filtering is switched off here
    dat <- as.data.frame(results(dds, independentFiltering = FALSE))

    ## positional rename of the six result columns to the names used by the
    ## benchmarking code below (the first column serves as the covariate)
    names(dat) <- c("ind_covariate", "effect_size", "SE",
                    "test_statistic", "pval", "padj")

    ## drop promoters for which no effect size was estimated
    dat <- dat[!is.na(dat[["effect_size"]]), , drop = FALSE]

    saveRDS(dat, file = result_file)
} else {
    dat <- readRDS(result_file)
}

4.2 Covariate Diagnostics

Here, we can check diagnostic plots to assess whether candidate covariates are actually informative.

4.2.1 Mean Coverage

We consider the average coverage across conditions and replicates as a potential covariate of interest. This is motivated by our prior assumption that ChIP-seq data always involve two sets of regions: signal and background. Sometimes, a few regions have coverage that falls between the typical signal and background levels. In this example, we see that the p-values are distributed differently for regions with low, medium, and high coverage.

## Diagnostic plots of the p-values ("pval") against the candidate covariate
## ("ind_covariate", the mean coverage column of `dat`).
## rank_scatter() and strat_hist() are not defined in this file; presumably
## they come from the helper scripts sourced from ../R -- TODO confirm.
rank_scatter(dat, pvalue = "pval", covariate = "ind_covariate")

## NOTE(review): `maxy` looks like an upper limit for the histogram y-axis;
## verify against the helper's definition.
strat_hist(dat, pvalue = "pval", covariate = "ind_covariate",
           maxy = 30)

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggplot2':
## 
##     ggsave

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:rlang':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

4.3 Multiple-Testing Correction

We use the common BenchDesign with the set of multiple testing correction methods already included. We also add Scott’s FDR Regression (with both nulltype = "empirical" and nulltype = "theoretical"), since our test statistics are approximately normally distributed. With the design complete, we construct the SummarizedBenchmark object, which runs the functions specified in each method.

## Build the SummarizedBenchmark object, or reuse the cached one when present.
if (file.exists(bench_file)) {
    sb <- readRDS(bench_file)
} else {
    ## initializeBenchDesign() is not defined in this file (presumably one of
    ## the helpers sourced from ../R); per the accompanying text it provides
    ## the common design with the standard correction methods already added.
    bd <- initializeBenchDesign()
    ## FDR regression with the theoretical null. The anonymous function pulls
    ## the FDR element out of the FDRreg result. `test_statistic` and
    ## `ind_covariate` are bare column names -- presumably resolved against
    ## the `data` passed to buildBench() below, so they must stay unevaluated
    ## here (TODO confirm).
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
    ## same method, differing only in the label and in nulltype = 'empirical'
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))

    ## run all methods on `dat` using the parallel backend configured during
    ## workspace setup, then cache the result
    sb <- buildBench(bd, data = dat, ftCols = c("ind_covariate"),
                     parallel = TRUE, BPPARAM = multicoreParam)
    saveRDS(sb, file = bench_file)
}

4.4 Benchmark Metrics

Next, we’ll add the default performance metric for q-value assays. First, we have to rename the assay to ‘qvalue’.

## Rename the assay of `sb` to "qvalue" before adding the default metrics;
## per the accompanying text the default metrics are defined for q-value
## assays.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)

Now, we’ll plot the results.

## Plots of the benchmark results. None of these plotting functions is
## defined in this file; presumably they come from the helper scripts in
## ../R -- TODO confirm.
rejections_scatter(sb, supplementary = FALSE)

## same covariate column as in the diagnostics, split into 4 bins
rejection_scatter_bins(sb, covariate = "ind_covariate",
                       bins = 4, supplementary = FALSE)

## overlap between methods at alpha = 0.05; nsets = ncol(sb) requests one set
## per column of `sb`
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)

covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")

5 Session Info

## record the R version, platform and package versions used for this run
sessionInfo()

## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2                         
##  [2] magrittr_1.5                           
##  [3] cowplot_0.9.2                          
##  [4] hexbin_1.27.2                          
##  [5] Rsubread_1.30.3                        
##  [6] DESeq2_1.20.0                          
##  [7] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
##  [8] GenomicFeatures_1.32.0                 
##  [9] AnnotationDbi_1.42.1                   
## [10] SummarizedBenchmark_0.99.1             
## [11] mclust_5.4                             
## [12] stringr_1.3.1                          
## [13] rlang_0.2.1                            
## [14] UpSetR_1.3.3                           
## [15] SummarizedExperiment_1.10.1            
## [16] DelayedArray_0.6.1                     
## [17] BiocParallel_1.14.2                    
## [18] matrixStats_0.53.1                     
## [19] Biobase_2.40.0                         
## [20] GenomicRanges_1.32.3                   
## [21] GenomeInfoDb_1.16.0                    
## [22] IRanges_2.14.10                        
## [23] S4Vectors_0.18.3                       
## [24] BiocGenerics_0.26.0                    
## [25] tidyr_0.8.1                            
## [26] ggplot2_3.0.0                          
## [27] dplyr_0.7.5                            
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6             bit64_0.9-7             
##  [3] RColorBrewer_1.1-2       progress_1.1.2          
##  [5] httr_1.3.1               rprojroot_1.3-2         
##  [7] tools_3.5.0              backports_1.1.2         
##  [9] R6_2.2.2                 rpart_4.1-13            
## [11] Hmisc_4.1-1              DBI_1.0.0               
## [13] lazyeval_0.2.1           colorspace_1.3-2        
## [15] nnet_7.3-12              withr_2.1.2             
## [17] tidyselect_0.2.4         gridExtra_2.3           
## [19] prettyunits_1.0.2        bit_1.1-14              
## [21] compiler_3.5.0           htmlTable_1.12          
## [23] labeling_0.3             rtracklayer_1.40.3      
## [25] scales_0.5.0             checkmate_1.8.5         
## [27] genefilter_1.62.0        digest_0.6.15           
## [29] Rsamtools_1.32.0         foreign_0.8-70          
## [31] rmarkdown_1.10           XVector_0.20.0          
## [33] base64enc_0.1-3          pkgconfig_2.0.1         
## [35] htmltools_0.3.6          htmlwidgets_1.2         
## [37] rstudioapi_0.7           RSQLite_2.1.1           
## [39] bindr_0.1.1              acepack_1.4.1           
## [41] RCurl_1.95-4.10          GenomeInfoDbData_1.1.0  
## [43] Formula_1.2-3            Matrix_1.2-14           
## [45] Rcpp_0.12.17             munsell_0.4.3           
## [47] stringi_1.2.2            yaml_2.1.19             
## [49] zlibbioc_1.26.0          plyr_1.8.4              
## [51] grid_3.5.0               blob_1.1.1              
## [53] lattice_0.20-35          Biostrings_2.48.0       
## [55] splines_3.5.0            annotate_1.58.0         
## [57] locfit_1.5-9.1           knitr_1.20              
## [59] pillar_1.2.3             geneplotter_1.58.0      
## [61] biomaRt_2.36.1           XML_3.98-1.11           
## [63] glue_1.2.0               evaluate_0.10.1         
## [65] latticeExtra_0.6-28      data.table_1.11.4       
## [67] gtable_0.2.0             purrr_0.2.5             
## [69] assertthat_0.2.0         xtable_1.8-2            
## [71] survival_2.41-3          tibble_1.4.2            
## [73] GenomicAlignments_1.16.0 memoise_1.1.0           
## [75] cluster_2.0.7-1

Case Study: ChIP-seq Differential Peak Calling (pre-defined regions)

Mingxiang Teng, Patrick Kimes

October 30, 2018