In this case study, we perform differential peak calling on ChIP-seq data of the histone H3K4Me3 for samples from two cell lines (K562 and GM12878) using publicly available data generated by the ENCODE Project. The same data set is used for all ChIP-seq differential testing case studies.
The csaw package provides a powerful and flexible approach to differential peak calling based on counting reads in sliding windows across the genome. The sliding window approach allows for the detection of differential peaks over sharper or wider regions, without being constrained by the original peak boundaries obtained from the peak calling algorithm, e.g. MACS. Differential peaks are called at each window using edgeR and p-values are combined within adjacent windows (clusters).
For this analysis, the default csaw settings are used as described in the csaw “User Guide.”
library(dplyr)
library(ggplot2)
library(SummarizedBenchmark)
library(BiocParallel)
library(rtracklayer)
library(Rsamtools)
library(Rsubread)
library(csaw)
library(edgeR)
library(GenomicAlignments)
## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
source(f)
}
## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/ChIPseq"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)
## intermediary files we create below
count_file <- file.path(resdir, "h3k4me3-csaw-counts.rds")
filtered_file <- file.path(resdir, "h3k4me3-csaw-counts-filtered.rds")
result_file <- file.path(resdir, "h3k4me3-csaw-results.rds")
bench_file <- file.path(sbdir, "h3k4me3-csaw-benchmark.rds")
bench_file_cov <- file.path(sbdir, "h3k4me3-csaw-cov-benchmark.rds")
bench_file_uninf <- file.path(sbdir, "h3k4me3-csaw-uninf-benchmark.rds")
## set up parallel backend
cores <- as.numeric(Sys.getenv("SLURM_NTASKS"))
multicoreParam <- MulticoreParam(workers = cores)
We download the fastq files directly from the UCSC ENCODE portal.
broad_url <- "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHistone/"
broad_fastqs <- c("wgEncodeBroadHistoneGm12878H3k4me3StdRawDataRep1.fastq.gz",
"wgEncodeBroadHistoneGm12878H3k04me3StdRawDataRep2V2.fastq.gz",
"wgEncodeBroadHistoneK562H3k4me3StdRawDataRep1.fastq.gz",
"wgEncodeBroadHistoneK562H3k4me3StdRawDataRep2.fastq.gz")
for (i_fq in broad_fastqs) {
if (!file.exists(file.path(datdir, i_fq))) {
download.file(paste0(broad_url, i_fq),
destfile = file.path(datdir, i_fq))
}
}
uw_url <- "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeUwHistone/"
uw_fastqs <- c("wgEncodeUwHistoneGm12878H3k4me3StdRawDataRep1.fastq.gz",
"wgEncodeUwHistoneGm12878H3k4me3StdRawDataRep2.fastq.gz",
"wgEncodeUwHistoneK562H3k4me3StdRawDataRep1.fastq.gz",
"wgEncodeUwHistoneK562H3k4me3StdRawDataRep2.fastq.gz")
for (i_fq in uw_fastqs) {
if (!file.exists(file.path(datdir, i_fq))) {
download.file(paste0(uw_url, i_fq),
destfile = file.path(datdir, i_fq))
}
}
We use the GRCh38 reference genome used in the ENCODE3 analysis. If the reference index is not already available, we must first download the reference file. Note that this is a different reference than what was originally used for the data in earlier analyses of the ENCODE project.
ref_url <- paste0("https://www.encodeproject.org/files/",
"GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/",
"GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz")
ref_fastagz <- file.path(datdir, basename(ref_url))
ref_fasta <- gsub("\\.gz$", "", ref_fastagz)
if (!file.exists(file.path(datdir, "ref_index.00.b.tab"))) {
if (!file.exists(ref_fasta)) {
download.file(ref_url, destfile = ref_fastagz)
system(paste("gunzip", ref_fastagz))
}
buildindex(basename = file.path(datdir, "ref_index"),
reference = ref_fasta)
}
We determine sample metadata from the file names.
fqfiles <- file.path(datdir, c(broad_fastqs, uw_fastqs))
labs <- gsub("wgEncode(.*)Histone.*", "\\1", basename(fqfiles))
cells <- gsub("wgEncode.*Histone(.*)H3k.*", "\\1", basename(fqfiles))
meta <- data.frame(cellline = cells, lab = labs,
fqfile = fqfiles,
bamfile = gsub("\\.fastq.gz", ".bam", fqfiles),
sortedfile = gsub("\\.fastq.gz", ".sorted.bam", fqfiles),
stringsAsFactors = FALSE)
We also download blacklisted regions for GRCh38 from ENCODE (https://www.encodeproject.org/annotations/ENCSR636HFF/).
blacklist_url <- paste0("http://mitra.stanford.edu/kundaje/akundaje/",
"release/blacklists/hg38-human/hg38.blacklist.bed.gz")
if (!file.exists(file.path(datdir, basename(blacklist_url)))) {
download.file(blacklist_url, destfile = file.path(datdir, basename(blacklist_url)))
}
bl <- import(file.path(datdir, basename(blacklist_url)))
Standard set of parameters are defined for the analysis. Only the canonical set of chromosomes are used in the analysis.
std_chr <- paste0("chr", c(1:22, "X", "Y"))
param <- readParam(minq = 20, discard = bl, restrict = std_chr)
We count reads within sliding windows across the genome.
if (file.exists(count_file)) {
win_cnts <- readRDS(count_file)
} else {
## Broad samples have quality score w/ phred offset +33 (standard)
b_meta <- meta[meta$lab == "Broad", ]
b_unaligned <- !file.exists(b_meta$bamfile)
if (any(b_unaligned)) {
align(index = file.path(datdir, "ref_index"),
readfile1 = b_meta$fqfile[b_unaligned],
TH1 = 2, type = 1, phredOffset = 33,
output_file = b_meta$bamfile[b_unaligned])
}
## UW samples have quality score w/ phred offset +64 (old Illumina scale)
u_meta <- meta[meta$lab == "Uw", ]
u_unaligned <- !file.exists(u_meta$bamfile)
if (any(u_unaligned)) {
align(index = file.path(datdir, "ref_index"),
readfile1 = u_meta$fqfile[u_unaligned],
TH1 = 2, type = 1, phredOffset = 64,
output_file = u_meta$bamfile[u_unaligned])
}
## sort bam files
for (i in 1:nrow(meta)) {
if (!file.exists(meta$sortedfile[i])) {
sortBam(meta$bamfile[i], gsub("\\.bam$", "", meta$sortedfile[i]))
}
}
## mark duplicates w/ picard and index bam files
if (any(!file.exists(paste0(meta$sortedfile, ".bai")))) {
temp_bam <- file.path(datdir, "temp_dups.bam")
temp_file <- file.path(datdir, "temp_mets.txt")
temp_dir <- file.path(datdir, "temp_dups")
dir.create(temp_dir)
for (i_bam in meta$sortedfile) {
code <- paste0("java -jar ${PICARD_TOOLS_HOME}/picard.jar ",
"MarkDuplicates I=%s O=%s M=%s ",
"TMP_DIR=%s AS=true REMOVE_DUPLICATES=false ",
"VALIDATION_STRINGENCY=SILENT")
code <- sprintf(code, i_bam, temp_bam, temp_file, temp_dir)
code <- system(code)
stopifnot(code == 0L)
file.rename(temp_bam, i_bam)
}
unlink(temp_file)
unlink(temp_dir, recursive = TRUE)
indexBam(meta$sortedfile)
}
## use correlateReads to determine fragment length (remove dups)
b_x <- correlateReads(b_meta$sortedfile, param = reform(param, dedup = TRUE))
u_x <- correlateReads(u_meta$sortedfile, param = reform(param, dedup = TRUE))
b_frag_len <- which.max(b_x) - 1
u_frag_len <- which.max(u_x) - 1
## count reads in sliding windows (keep dups)
win_cnts <- windowCounts(meta$sortedfile, param = param, width = 150,
ext = list(rep(c(b_frag_len, u_frag_len), each = 4), 150))
## save unfiltered counts
saveRDS(win_cnts, file = count_file)
}
We can apply prefiltering on windows with low abundance as described in Lun and Smyth (2016).
if (file.exists(filtered_file)) {
filtered_cnts <- readRDS(filtered_file)
} else {
## filter windows by abundance
bins <- windowCounts(meta$sortedfile, bin = TRUE, width = 2000, param = param)
filter_stat <- filterWindows(win_cnts, bins, type = "global")
keep <- filter_stat$filter > log2(2)
filtered_cnts <- win_cnts[keep, ]
## save filtered counts
saveRDS(filtered_cnts, file = filtered_file)
}
We use edgeR to test for differential binding with the filtered data.
if (file.exists(result_file)) {
res_ranges <- readRDS(result_file)
} else {
## normalize for library-specific trended biases
filtered_cnts <- normOffsets(filtered_cnts, type = "loess")
## create model design
design <- model.matrix(~ lab + cellline,
data = as.data.frame(meta))
## run quasi-likelihood F-test
y <- asDGEList(filtered_cnts)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design, robust = TRUE)
res <- glmQLFTest(fit)
## merge p-values across regions
merged <- mergeWindows(rowRanges(filtered_cnts), tol = 100, max.width = 5000)
tab_comb <- combineTests(merged$id, res$table)
tab_best <- getBestTest(merged$id, res$table)
## save results
res_ranges <- merged$region
elementMetadata(res_ranges) <-
data.frame(tab_comb,
best_pos = mid(ranges(rowRanges(filtered_cnts[tab_best$best]))),
best_logFC = tab_best$logFC)
## get overall mean counts for each window
merged_cnts <- summarizeOverlaps(res_ranges, BamFileList(meta$sortedfile))
res_ranges$meancnt <- rowMeans(assays(merged_cnts)$counts)
saveRDS(res_ranges, file = result_file)
}
## covert to df for downstream analysis
res_df <- as.data.frame(res_ranges)
## add random covariate
set.seed(7778)
res_df$uninf_covar = rnorm(nrow(res_df))
rank_scatter(res_df, pvalue = "PValue", covariate = "nWindows") +
ggtitle("Number of windows as independent covariate") +
xlab("Number of Windows")
strat_hist(res_df, pvalue = "PValue", covariate = "nWindows", maxy = 35)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:rlang':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
rank_scatter(res_df, pvalue = "PValue", covariate = "width") +
ggtitle("Region width as independent covariate") +
xlab("Region Width")
strat_hist(res_df, pvalue = "PValue", covariate = "width", maxy = 35)
rank_scatter(res_df, pvalue = "PValue", covariate = "meancnt") +
ggtitle("Mean coverage as independent covariate") +
xlab("Mean coverage")
strat_hist(res_df, pvalue = "PValue", covariate = "meancnt", maxy = 35)
rank_scatter(res_df, pvalue = "PValue", covariate = "uninf_covar") +
ggtitle("Random independent covariate") +
xlab("Region Width")
strat_hist(res_df, pvalue = "PValue", covariate = "uninf_covar", maxy = 35)
We use the common BenchDesign
with the set of multiple testing correction methods already included. We investigate both the width of the regions and their mean coverage as the independent covariate. We won’t assess the number of windows, since this is very tightly correlated with the width of the region.
cor(res_df[,c("width", "nWindows", "meancnt")])
## width nWindows meancnt
## width 1.0000000 0.9983152 0.7328940
## nWindows 0.9983152 1.0000000 0.7412371
## meancnt 0.7328940 0.7412371 1.0000000
First, we’ll use the region width covariate.
if (!file.exists(bench_file)) {
res_df$ind_covariate <- res_df$width
res_df$pval <- res_df$PValue
bd <- initializeBenchDesign()
sb <- buildBench(bd, data = res_df, ftCols = "ind_covariate")
saveRDS(sb, file = bench_file)
} else {
sb <- readRDS(bench_file)
}
Next, we’ll use the mean coverage covariate.
if (!file.exists(bench_file_cov)) {
res_df$ind_covariate <- res_df$meancnt
res_df$pval <- res_df$PValue
bd <- initializeBenchDesign()
sbC <- buildBench(bd, data = res_df, ftCols = "ind_covariate")
saveRDS(sbC, file = bench_file_cov)
} else {
sbC <- readRDS(bench_file_cov)
}
We’ll also compare to the random covariate.
if (!file.exists(bench_file_uninf)) {
res_df$ind_covariate <- res_df$uninf_covar
res_df$pval <- res_df$PValue
bd <- initializeBenchDesign()
sbU <- buildBench(bd, data = res_df, ftCols = "ind_covariate")
saveRDS(sbU, file = bench_file_uninf)
} else {
sbU <- readRDS(bench_file_uninf)
}
Next, we’ll add the default performance metric for q-value assays. First, we have to rename the assay to ‘qvalue’.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
Now, we’ll plot the results.
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate",
bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
order.by = "freq", decreasing = TRUE,
supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
## Warning: Removed 25 rows containing missing values (geom_path).
We’ll do the same for the mean coverage covariate.
assayNames(sbC) <- "qvalue"
sbC <- addDefaultMetrics(sbC)
Now, we’ll plot the results.
rejections_scatter(sbC, supplementary = FALSE)
rejection_scatter_bins(sbC, covariate = "ind_covariate",
bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sbC, alpha = 0.05, nsets = ncol(sbC),
order.by = "freq", decreasing = TRUE,
supplementary = FALSE)
covariateLinePlot(sbC, alpha = 0.05, covname = "ind_covariate")
## Warning: Removed 25 rows containing missing values (geom_path).
We’ll do the same for the random (uninformative covariate).
assayNames(sbU) <- "qvalue"
sbU <- addDefaultMetrics(sbU)
Now, we’ll plot the results.
rejections_scatter(sbU, supplementary = FALSE)
rejection_scatter_bins(sbU, covariate = "ind_covariate",
bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sbU, alpha = 0.05, nsets = ncol(sbU),
order.by = "freq", decreasing = TRUE,
supplementary = FALSE)
covariateLinePlot(sbU, alpha = 0.05, covname = "ind_covariate")
## Warning: Removed 25 rows containing missing values (geom_path).
Here we compare the method ranks for the two covariates at alpha = 0.10.
plotMethodRanks(c(bench_file, bench_file_cov, bench_file_uninf),
colLabels = c("Region width", "Mean Coverage", "Random"),
alpha = 0.10, xlab = "Covariate",
excludeMethods = NULL)
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 magrittr_1.5
## [3] cowplot_0.9.2 hexbin_1.27.2
## [5] GenomicAlignments_1.16.0 edgeR_3.22.3
## [7] limma_3.36.2 csaw_1.14.1
## [9] Rsubread_1.30.3 Rsamtools_1.32.0
## [11] Biostrings_2.48.0 XVector_0.20.0
## [13] rtracklayer_1.40.3 SummarizedBenchmark_0.99.1
## [15] mclust_5.4 stringr_1.3.1
## [17] rlang_0.2.1 UpSetR_1.3.3
## [19] SummarizedExperiment_1.10.1 DelayedArray_0.6.1
## [21] BiocParallel_1.14.2 matrixStats_0.53.1
## [23] Biobase_2.40.0 GenomicRanges_1.32.3
## [25] GenomeInfoDb_1.16.0 IRanges_2.14.10
## [27] S4Vectors_0.18.3 BiocGenerics_0.26.0
## [29] tidyr_0.8.1 ggplot2_3.0.0
## [31] dplyr_0.7.5
##
## loaded via a namespace (and not attached):
## [1] httr_1.3.1 bit64_0.9-7 assertthat_0.2.0
## [4] blob_1.1.1 GenomeInfoDbData_1.1.0 yaml_2.1.19
## [7] progress_1.1.2 pillar_1.2.3 RSQLite_2.1.1
## [10] backports_1.1.2 lattice_0.20-35 glue_1.2.0
## [13] digest_0.6.15 RColorBrewer_1.1-2 colorspace_1.3-2
## [16] htmltools_0.3.6 Matrix_1.2-14 plyr_1.8.4
## [19] XML_3.98-1.11 pkgconfig_2.0.1 biomaRt_2.36.1
## [22] zlibbioc_1.26.0 purrr_0.2.5 scales_0.5.0
## [25] tibble_1.4.2 withr_2.1.2 GenomicFeatures_1.32.0
## [28] lazyeval_0.2.1 memoise_1.1.0 evaluate_0.10.1
## [31] prettyunits_1.0.2 tools_3.5.0 Rhtslib_1.12.1
## [34] munsell_0.4.3 locfit_1.5-9.1 AnnotationDbi_1.42.1
## [37] compiler_3.5.0 grid_3.5.0 RCurl_1.95-4.10
## [40] labeling_0.3 bitops_1.0-6 rmarkdown_1.10
## [43] gtable_0.2.0 DBI_1.0.0 R6_2.2.2
## [46] gridExtra_2.3 knitr_1.20 bit_1.1-14
## [49] bindr_0.1.1 rprojroot_1.3-2 stringi_1.2.2
## [52] Rcpp_0.12.17 tidyselect_0.2.4