1 Summary

In this case study, we perform differential peak calling on ChIP-seq data for a trnscription factor, CREB-binding protein (CBP), from Kasper et al. (2014), used in Lun et al. (2015) to demonstrate the usage of csaw. As described in a separate case study analyzing ChIP-seq data for H3K4Me3, csaw is a package for differential peak calling based on counting reads in sliding windows across the genome. Code and steps for initial processing and analysis of this data with csaw are taken directly from Lun et al. (2015).

2 Workspace Setup

library(dplyr)
library(ggplot2)
library(SummarizedBenchmark)
library(BiocParallel)
library(rtracklayer)
library(Rsamtools)
library(Rsubread)
library(csaw)
library(edgeR)
library(GenomicAlignments)

## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/ChIPseq"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)

## intermediary files we create below
count_file <- file.path(resdir, "cbp-csaw-counts.rds")
filtered_file <- file.path(resdir, "cbp-csaw-counts-filtered.rds")
normfacs_file <- file.path(resdir, "cbp-csaw-counts-normfacs.rds")
result_file <- file.path(resdir, "cbp-csaw-results.rds")
bench_file <- file.path(sbdir, "cbp-csaw-benchmark.rds")
bench_file_cov <- file.path(sbdir, "cbp-csaw-cov-benchmark.rds")
bench_file_uninf <- file.path(sbdir, "cbp-csaw-uninf-benchmark.rds")

## set up parallel backend
cores <- as.numeric(Sys.getenv("SLURM_NTASKS"))
multicoreParam <- MulticoreParam(workers = cores)

3 Data Preparation

We download the fastq files directly from the European Nucleotide Archive (ENA).

fqurls <- c("007/SRR1145787/SRR1145787.fastq.gz",
            "008/SRR1145788/SRR1145788.fastq.gz",
            "009/SRR1145789/SRR1145789.fastq.gz",
            "000/SRR1145790/SRR1145790.fastq.gz")
fqurls <- paste0("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR114/", fqurls)

for (i_fq in fqurls) {
    out_fq <- file.path(datdir, basename(i_fq))
    if (!file.exists(out_fq)) {
        download.file(i_fq, destfile = out_fq)
    }
}

We download the mm10 reference genome used for ENCODE3 and build the index for alignment if not already available.

ref_url <- paste0("https://www.encodeproject.org/files/",
                  "mm10_no_alt_analysis_set_ENCODE/@@download/",
                  "mm10_no_alt_analysis_set_ENCODE.fasta.gz")
ref_fastagz <- file.path(datdir, basename(ref_url))
ref_fasta <- gsub("\\.gz$", "", ref_fastagz)

if (!file.exists(file.path(datdir, "mm_ref_index.00.b.tab"))) {
    if (!file.exists(ref_fasta)) {
        download.file(ref_url, destfile = ref_fastagz)
        system(paste("gunzip", ref_fastagz))
    }
    buildindex(basename = file.path(datdir, "mm_ref_index"),
               reference = ref_fasta)
}

Sample metadata is stored in a data.frame.

fqfiles <- file.path(datdir, basename(fqurls))
meta <- data.frame(genotype = factor(c("wt", "wt", "ko", "ko")),
                   fqurl = fqurls, fqfile = fqfiles,
                   bamfile = gsub("\\.fastq\\.gz", ".bam", fqfiles),
                   sortedfile = gsub("\\.fastq\\.gz", ".sorted.bam", fqfiles),
                   stringsAsFactors = FALSE)

We also download blacklisted regions for mm10 from ENCODE (https://www.encodeproject.org/annotations/ENCSR636HFF/).

blacklist_url <- "https://www.encodeproject.org/files/ENCFF547MET/@@download/ENCFF547MET.bed.gz"
if (!file.exists(file.path(datdir, basename(blacklist_url)))) {
    download.file(blacklist_url, destfile = file.path(datdir, basename(blacklist_url)))
}

bl <- import(file.path(datdir, basename(blacklist_url)))

Standard set of parameters are defined for the analysis. Only the canonical set of chromosomes are used in the analysis.

std_chr <- paste0("chr", c(1:19, "X", "Y"))
param <- readParam(minq = 20, discard = bl, restrict = std_chr)

3.1 Read Counting

We count reads within sliding windows across the genome.

if (file.exists(count_file)) {
    win_cnts <- readRDS(count_file)
} else {
    unaligned <- !file.exists(meta$bamfile)
    if (any(unaligned)) {
        align(index = file.path(datdir, "mm_ref_index"),
              readfile1 = meta$fqfile[unaligned],
              type = 1, phredOffset = 64,
              input_format = "FASTQ",
              output_file = meta$bamfile[unaligned])
    }
    
    ## sort bam files
    for (i in 1:nrow(meta)) {
        if (!file.exists(meta$sortedfile[i])) {
            sortBam(meta$bamfile[i], gsub("\\.bam$", "", meta$sortedfile[i]))
        }
    }
    
    ## mark duplicates w/ picard and index bam files
    if (any(!file.exists(paste0(meta$sortedfile, ".bai")))) {
        temp_bam <- file.path(datdir, "temp_dups_mm.bam")
        temp_file <- file.path(datdir, "temp_mets_mm.txt")
        temp_dir <- file.path(datdir, "temp_dups_mm")
        dir.create(temp_dir)
        for (i_bam in meta$sortedfile) {
            code <- paste0("java -jar ${PICARD_TOOLS_HOME}/picard.jar ",
                           "MarkDuplicates I=%s O=%s M=%s ", 
                           "TMP_DIR=%s AS=true REMOVE_DUPLICATES=false ",
                           "VALIDATION_STRINGENCY=SILENT")
            code <- sprintf(code, i_bam, temp_bam, temp_file, temp_dir)
            code <- system(code)
            stopifnot(code == 0L)
            file.rename(temp_bam, i_bam)
        }
        unlink(temp_file)
        unlink(temp_dir, recursive = TRUE)
        
        indexBam(meta$sortedfile)
    }
    
    ## use correlateReads to determine fragment length (remove dups)
    x <- correlateReads(meta$sortedfile, param = reform(param, dedup = TRUE))
    frag_len <- which.max(x) - 1
    
    ## count reads in sliding windows (keep dups)
    win_cnts <- windowCounts(meta$sortedfile, param = param, width = 10, ext = frag_len)
                             
    ## save unfiltered counts
    saveRDS(win_cnts, file = count_file)
}

We can apply prefiltering on windows with low abundance as described in Lun and Smyth (2016).

if (file.exists(filtered_file)) {
    filtered_cnts <- readRDS(filtered_file)
    normfacs <- readRDS(normfacs_file)
} else {
    ## filter windows by abundance
    bins <- windowCounts(meta$sortedfile, bin = TRUE, width = 10000, param = param)
    filter_stat <- filterWindows(win_cnts, bins, type = "global")
    keep <- filter_stat$filter > log2(3)
    filtered_cnts <- win_cnts[keep, ]

    ## calculate normalizing offsets based on larger windows
    normfacs <- normOffsets(bins, se.out = FALSE)
    
    ## save filtered counts, normalizing factors
    saveRDS(filtered_cnts, file = filtered_file)
    saveRDS(normfacs, file = normfacs_file)
}

4 Data Analysis

4.1 Differential Testing

We use edgeR to test for differential binding with the filtered data.

if (file.exists(result_file)) {
    res_ranges <- readRDS(result_file)
} else {
    ## create model diesign
    design <- model.matrix(~ 0 + genotype, data = meta)
    colnames(design) <- levels(meta$genotype)

    ## run quasi-likelihood F-test
    y <- asDGEList(filtered_cnts, norm.factors = normfacs)
    y <- estimateDisp(y, design)
    fit <- glmQLFit(y, design, robust = TRUE)
    res <- glmQLFTest(fit, contrast = makeContrasts(wt-ko, levels = design))

    ## merge p-values across regions
    merged <- mergeWindows(rowRanges(filtered_cnts), tol = 100, max.width = 5000)
    tab_comb <- combineTests(merged$id, res$table)
    tab_best <- getBestTest(merged$id, res$table)

    ## save results
    res_ranges <- merged$region
    elementMetadata(res_ranges) <-
        data.frame(tab_comb,
                   best_pos = mid(ranges(rowRanges(filtered_cnts[tab_best$best]))),
                   best_logFC = tab_best$logFC)

    ## get overall mean counts for each window
    merged_cnts <- summarizeOverlaps(res_ranges, BamFileList(meta$sortedfile))
    res_ranges$meancnt <- rowMeans(assays(merged_cnts)$counts)
    
    saveRDS(res_ranges, file = result_file)
}

## covert to df for downstream analysis
res_df <- as.data.frame(res_ranges)

## add random covariate 
set.seed(11245)
res_df$uninf_covar = rnorm(nrow(res_df))

4.2 Covariate Diagnostics

4.2.1 Number of Windows

rank_scatter(res_df, pvalue = "PValue", covariate = "nWindows") +
    ggtitle("Number of windows as independent covariate") +
    xlab("Number of Windows")

strat_hist(res_df, pvalue = "PValue", covariate = "nWindows", maxy = 15)

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggplot2':
## 
##     ggsave

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:rlang':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

4.2.2 Region Width

rank_scatter(res_df, pvalue = "PValue", covariate = "width") +
    ggtitle("Region width as independent covariate") +
    xlab("Region Width")

strat_hist(res_df, pvalue = "PValue", covariate = "width", maxy = 15)

4.2.3 Mean Coverage

rank_scatter(res_df, pvalue = "PValue", covariate = "meancnt") +
    ggtitle("Mean coverage as independent covariate") +
    xlab("Mean coverage")

strat_hist(res_df, pvalue = "PValue", covariate = "meancnt", maxy = 15)

4.2.4 Random

rank_scatter(res_df, pvalue = "PValue", covariate = "uninf_covar") +
    ggtitle("Random independent covariate") +
    xlab("Region Width")

strat_hist(res_df, pvalue = "PValue", covariate = "uninf_covar", maxy = 15)

4.3 Multiple-Testing Correction

We use the common BenchDesign with the set of multiple testing correction methods already included. We investigate both the width of the regions and their mean coverage as the independent covariate. We won’t assess the number of windows, since this is very tightly correlated with the width of the region.

cor(res_df[,c("width", "nWindows", "meancnt")])

##              width  nWindows   meancnt
## width    1.0000000 0.9982230 0.1023338
## nWindows 0.9982230 1.0000000 0.1029068
## meancnt  0.1023338 0.1029068 1.0000000

First, we’ll use the region width covariate.

if (!file.exists(bench_file)) {
    res_df$ind_covariate <- res_df$width
    res_df$pval <- res_df$PValue

    bd <- initializeBenchDesign()
    sb <- buildBench(bd, data = res_df, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file)
} else {
    sb <- readRDS(bench_file)
}

Next, we’ll use the mean coverage covariate.

if (!file.exists(bench_file_cov)) {
    res_df$ind_covariate <- res_df$meancnt
    res_df$pval <- res_df$PValue

    bd <- initializeBenchDesign()
    sbC <- buildBench(bd, data = res_df, ftCols = "ind_covariate")
    saveRDS(sbC, file = bench_file_cov)
} else {
    sbC <- readRDS(bench_file_cov)
}

We’ll also compare to the random covariate.

if (!file.exists(bench_file_uninf)) {
    res_df$ind_covariate <- res_df$uninf_covar
    res_df$pval <- res_df$PValue

    bd <- initializeBenchDesign()
    sbU <- buildBench(bd, data = res_df, ftCols = "ind_covariate")
    saveRDS(sbU, file = bench_file_uninf)
} else {
    sbU <- readRDS(bench_file_uninf)
}

4.4 Benchmark Metrics

4.4.1 Region width

Next, we’ll add the default performance metric for q-value assays. First, we have to rename the assay to ‘qvalue’.

assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)

Now, we’ll plot the results.

rejections_scatter(sb, supplementary = FALSE)

rejection_scatter_bins(sb, covariate = "ind_covariate",
                       bins = 4, supplementary = FALSE)

plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)

covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")

## Warning: Removed 25 rows containing missing values (geom_path).

4.4.2 Mean Coverage

We’ll do the same for the mean coverage covariate.

assayNames(sbC) <- "qvalue"
sbC <- addDefaultMetrics(sbC)

Now, we’ll plot the results.

rejections_scatter(sbC, supplementary = FALSE)

rejection_scatter_bins(sbC, covariate = "ind_covariate",
                       bins = 4, supplementary = FALSE)

plotFDRMethodsOverlap(sbC, alpha = 0.05, nsets = ncol(sbC),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)

covariateLinePlot(sbC, alpha = 0.05, covname = "ind_covariate")

## Warning: Removed 25 rows containing missing values (geom_path).

4.4.3 Random

We’ll do the same for the random (uninformative covariate).

assayNames(sbU) <- "qvalue"
sbU <- addDefaultMetrics(sbU)

Now, we’ll plot the results.

rejections_scatter(sbU, supplementary = FALSE)

rejection_scatter_bins(sbU, covariate = "ind_covariate",
                       bins = 4, supplementary = FALSE)

plotFDRMethodsOverlap(sbU, alpha = 0.05, nsets = ncol(sbU),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)

covariateLinePlot(sbU, alpha = 0.05, covname = "ind_covariate")

## Warning: Removed 25 rows containing missing values (geom_path).

4.5 Covariate comparison

Here we compare the method ranks for the two covariates at alpha = 0.10.

plotMethodRanks(c(bench_file, bench_file_cov, bench_file_uninf), 
                colLabels = c("Region width", "Mean Coverage", "Random"), 
                alpha = 0.10, xlab = "Covariate", 
                excludeMethods = NULL)

5 Session Info

sessionInfo()

## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2              magrittr_1.5               
##  [3] cowplot_0.9.2               hexbin_1.27.2              
##  [5] GenomicAlignments_1.16.0    edgeR_3.22.3               
##  [7] limma_3.36.2                csaw_1.14.1                
##  [9] Rsubread_1.30.3             Rsamtools_1.32.0           
## [11] Biostrings_2.48.0           XVector_0.20.0             
## [13] rtracklayer_1.40.3          SummarizedBenchmark_0.99.1 
## [15] mclust_5.4                  stringr_1.3.1              
## [17] rlang_0.2.1                 UpSetR_1.3.3               
## [19] SummarizedExperiment_1.10.1 DelayedArray_0.6.1         
## [21] BiocParallel_1.14.2         matrixStats_0.53.1         
## [23] Biobase_2.40.0              GenomicRanges_1.32.3       
## [25] GenomeInfoDb_1.16.0         IRanges_2.14.10            
## [27] S4Vectors_0.18.3            BiocGenerics_0.26.0        
## [29] tidyr_0.8.1                 ggplot2_3.0.0              
## [31] dplyr_0.7.5                
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.3.1             bit64_0.9-7            assertthat_0.2.0      
##  [4] blob_1.1.1             GenomeInfoDbData_1.1.0 yaml_2.1.19           
##  [7] progress_1.1.2         pillar_1.2.3           RSQLite_2.1.1         
## [10] backports_1.1.2        lattice_0.20-35        glue_1.2.0            
## [13] digest_0.6.15          RColorBrewer_1.1-2     colorspace_1.3-2      
## [16] htmltools_0.3.6        Matrix_1.2-14          plyr_1.8.4            
## [19] XML_3.98-1.11          pkgconfig_2.0.1        biomaRt_2.36.1        
## [22] zlibbioc_1.26.0        purrr_0.2.5            scales_0.5.0          
## [25] tibble_1.4.2           withr_2.1.2            GenomicFeatures_1.32.0
## [28] lazyeval_0.2.1         memoise_1.1.0          evaluate_0.10.1       
## [31] prettyunits_1.0.2      tools_3.5.0            Rhtslib_1.12.1        
## [34] munsell_0.4.3          locfit_1.5-9.1         AnnotationDbi_1.42.1  
## [37] compiler_3.5.0         grid_3.5.0             RCurl_1.95-4.10       
## [40] labeling_0.3           bitops_1.0-6           rmarkdown_1.10        
## [43] gtable_0.2.0           DBI_1.0.0              R6_2.2.2              
## [46] gridExtra_2.3          knitr_1.20             bit_1.1-14            
## [49] bindr_0.1.1            rprojroot_1.3-2        stringi_1.2.2         
## [52] Rcpp_0.12.17           tidyselect_0.2.4

Case Study: ChIP-seq Differential Peak Calling (csaw)

Patrick Kimes

October 30, 2018

1 Summary

2 Workspace Setup

3 Data Preparation

3.1 Read Counting

4 Data Analysis

4.1 Differential Testing

4.2 Covariate Diagnostics

4.2.1 Number of Windows

4.2.2 Region Width

4.2.3 Mean Coverage

4.2.4 Random

4.3 Multiple-Testing Correction

4.4 Benchmark Metrics

4.4.1 Region width

4.4.2 Mean Coverage

4.4.3 Random

4.5 Covariate comparison

5 Session Info