The objective of this document is to compare different methods for multiple testing correction in the context of Gene Set Enrichment Analysis (GSEA). We use RNA-seq data consisting of cortex and cerebellum samples of a subset of the GTEx individuals. Shortly, samples were downloaded using the Short Read Archive Toolkit and mapped to the human reference genome version GRCh38 using STAR v2.4.2a. htseq-count was used to tabulate the number of uniquely mapping reads for each gene. We used the DESeq2
package to format the data into a DESeqDataSet
object.
Here we use the goseq
Bioconductor package to implement the gene set analysis. This is an Over-Representation Analysis which does requires setting an arbitrary threshold for Differential Expression. The test concerns whether any of the GO sets are enriched for DE genes.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:dplyr':
##
## exprs
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply
library(EnsDb.Hsapiens.v86)
## Loading required package: ensembldb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: AnnotationFilter
##
## Attaching package: 'ensembldb'
## The following object is masked from 'package:dplyr':
##
## filter
## The following object is masked from 'package:stats':
##
## filter
library(biomaRt)
library(goseq)
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
##
## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
source(f)
}
## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/GSEA"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)
## intermediary files we create below
count_file <- file.path(datdir, "human-counts.rds")
deseq_file <- file.path(datdir, "human-deseq.rds")
goset_file <- file.path(datdir, "human-gosets.rds")
result_file <- file.path(resdir, "human-results-goseq.rds")
We download the DESeqDataSet from zenodo, if not present locally, and modify the design to test for differences in gene expression between cerebellums and cortex.
if (!file.exists(count_file)) {
download.file("https://zenodo.org/record/1475409/files/gsea-human-counts.rds?download=1",
destfile = count_file)
}
dsdObject <- readRDS(count_file)
design( dsdObject ) <- ~ tissue
To keep running times short, we only perform differential tesing on protein coding genes, as specified in Ensembl release 86.
gnType <- genes(EnsDb.Hsapiens.v86, column = "gene_biotype")
protGenes <- gnType$gene_id[gnType$gene_biotype == "protein_coding"]
dsdObject <- dsdObject[rownames(dsdObject) %in% protGenes, ]
Next, we run DESeq2 to retrieve a list of differentially expressed genes at a FDR of 10%.
if (!file.exists(deseq_file)) {
dsdObject <- DESeq(dsdObject)
res <- results(dsdObject, independentFiltering = FALSE)
saveRDS(res, file = deseq_file)
} else {
res <- readRDS(deseq_file)
}
genes <- as.numeric(res$padj < 0.1)
names(genes) <- rownames(res)
sum(genes, na.rm=TRUE)
## [1] 9853
Next we’ll use the biomaRt package to download and associate GO annotations for each gene.
if (!file.exists(goset_file)) {
mart <- useMart("ensembl", "hsapiens_gene_ensembl")
goSets <- getBM(c("ensembl_gene_id", "go_id"), mart = mart,
filters = "ensembl_gene_id", values = rownames(res))
goSets <- goSets[!nchar( goSets$go_id ) == 0, ]
goSets <- with(goSets, split(go_id, ensembl_gene_id))
saveRDS(goSets, file = goset_file)
} else {
goSets <- readRDS(goset_file)
}
Finally, we’ll perform a gene set enrichment analysis with goseq. goseq performs GSEA and adjusts for gene length biases. The bias is represented in the plot below, with gene length bin on the x-axis and proportion of genes detected to be differentially expressed on the y-axis.
if (!file.exists(result_file)) {
## get median transcript length for bias adjustment
txByGene <- transcriptsBy(EnsDb.Hsapiens.v86, "gene")
geneLength <- sapply(width(txByGene), median)
geneLength <- geneLength[names(genes)]
genes[is.na(genes)] <- 0
## perform gsea
pwf <- nullp(genes, bias.data = geneLength)
goRes <- goseq(pwf, gene2cat = goSets)
saveRDS(goRes, file = result_file)
} else {
goRes <- readRDS(result_file)
}
## Add random (uninformative) covariate
set.seed(66778)
goRes$rand_covar <- rnorm(nrow(goRes))
We filter out really small gene sets and those gene sets which have no DE genes in them (many of these are set to 1).
goRes <- goRes %>%
dplyr::filter(numDEInCat > 0, numInCat > 5 )
Here, we want to check whether the size of the gene set is actually informative and independent under the null.
We will explore whether the size of the gene set can be used as a covariate for modern multiple-testing correction methods in the context of GSEA. In the plot below, the log10 of the p-values is plotted as a function of the size of the gene set. There is a pattern in which gene sets with a higher number of genes tend to have smaller p-values, which is indicative that gene set size is an informative covariate.
rank_scatter(dat = goRes, pval = "over_represented_pvalue",
covariate = "numInCat", bins = 50, funx = log2,
funfill = log10_trans()) +
ylim(0, 12) +
ggtitle("Over-represented gene sets") +
xlab(expression(log[10]~"(# of genes)")) +
ylab(expression(-log[10]~"(p-value)") )
## Warning: Removed 4 rows containing non-finite values (stat_binhex).
## Warning: Removed 40 rows containing missing values (geom_hex).
But unfortunately the distribution p-values is quite different for the different covariate strata. This is not surprising, as we expect that smaller gene sets that have at least one DE gene in them will be skewed toward smaller p-values (as compared to larger gene sets that have at least one DE gene), simply because a single DE gene represents a larger proportion in the smaller sets. If we don’t condition on sets that have at least one DE gene, however, the distribution of p-values is spiked at 1 since goseq assigns all such sets to a p-value of 1.
strat_hist(goRes, pval="over_represented_pvalue", covariate="numInCat", maxy=7)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing non-finite values (stat_bin).
This suggests that the covariate is not independent under the null hypothesis, so the assumptions of many of the methods (which use an independent covariate) are not satisfied. Therefore we will not proceed with the benchmarking using these results.
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] magrittr_1.5 cowplot_0.9.2
## [3] hexbin_1.27.2 bindrcpp_0.2.2
## [5] goseq_1.32.0 geneLenDataBase_1.16.0
## [7] BiasedUrn_1.07 biomaRt_2.36.1
## [9] EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.4.1
## [11] AnnotationFilter_1.4.0 GenomicFeatures_1.32.0
## [13] AnnotationDbi_1.42.1 DESeq2_1.20.0
## [15] SummarizedExperiment_1.10.1 DelayedArray_0.6.1
## [17] BiocParallel_1.14.2 matrixStats_0.53.1
## [19] Biobase_2.40.0 GenomicRanges_1.32.3
## [21] GenomeInfoDb_1.16.0 IRanges_2.14.10
## [23] S4Vectors_0.18.3 BiocGenerics_0.26.0
## [25] scales_0.5.0 ggplot2_3.0.0
## [27] dplyr_0.7.5
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-137 ProtGenerics_1.12.0
## [3] bitops_1.0-6 bit64_0.9-7
## [5] RColorBrewer_1.1-2 progress_1.1.2
## [7] httr_1.3.1 rprojroot_1.3-2
## [9] tools_3.5.0 backports_1.1.2
## [11] R6_2.2.2 rpart_4.1-13
## [13] mgcv_1.8-23 Hmisc_4.1-1
## [15] DBI_1.0.0 lazyeval_0.2.1
## [17] colorspace_1.3-2 nnet_7.3-12
## [19] withr_2.1.2 tidyselect_0.2.4
## [21] gridExtra_2.3 prettyunits_1.0.2
## [23] bit_1.1-14 curl_3.2
## [25] compiler_3.5.0 htmlTable_1.12
## [27] labeling_0.3 rtracklayer_1.40.3
## [29] checkmate_1.8.5 genefilter_1.62.0
## [31] stringr_1.3.1 digest_0.6.15
## [33] Rsamtools_1.32.0 foreign_0.8-70
## [35] rmarkdown_1.10 XVector_0.20.0
## [37] base64enc_0.1-3 pkgconfig_2.0.1
## [39] htmltools_0.3.6 htmlwidgets_1.2
## [41] rlang_0.2.1 rstudioapi_0.7
## [43] RSQLite_2.1.1 bindr_0.1.1
## [45] acepack_1.4.1 RCurl_1.95-4.10
## [47] GO.db_3.6.0 GenomeInfoDbData_1.1.0
## [49] Formula_1.2-3 Matrix_1.2-14
## [51] Rcpp_0.12.17 munsell_0.4.3
## [53] stringi_1.2.2 yaml_2.1.19
## [55] zlibbioc_1.26.0 plyr_1.8.4
## [57] grid_3.5.0 blob_1.1.1
## [59] lattice_0.20-35 Biostrings_2.48.0
## [61] splines_3.5.0 annotate_1.58.0
## [63] locfit_1.5-9.1 knitr_1.20
## [65] pillar_1.2.3 geneplotter_1.58.0
## [67] XML_3.98-1.11 glue_1.2.0
## [69] evaluate_0.10.1 latticeExtra_0.6-28
## [71] data.table_1.11.4 gtable_0.2.0
## [73] purrr_0.2.5 assertthat_0.2.0
## [75] xtable_1.8-2 survival_2.41-3
## [77] tibble_1.4.2 GenomicAlignments_1.16.0
## [79] memoise_1.1.0 cluster_2.0.7-1