In this set of simulations, we consider settings with both null and non-null tests with varying distribution of effect sizes under the non-null (alternative) setting. Both informative and uninformative covariates are included in the setting as described in simulations-informative-cubic.Rmd
. The effect sizes for non-null tests are sampled from unimodal distributions composed of a mixture of normal distributions, as described in the original adaptive shrinkage (ASH) manuscript (Stephens, 2016).
library(dplyr)
library(ggplot2)
library(SummarizedBenchmark)
library(parallel)
## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
source(f)
}
## project data/results folders
resdir <- "results"
dir.create(resdir, showWarnings = FALSE, recursive = TRUE)
## intermediary files we create below
spiky_file <- file.path(resdir, "uasettings-benchmark-spiky.rds")
flattop_file <- file.path(resdir, "uasettings-benchmark-flattop.rds")
skew_file <- file.path(resdir, "uasettings-benchmark-skew.rds")
bimodal_file <- file.path(resdir, "uasettings-benchmark-bimodal.rds")
## number of cores for parallelization
cores <- 20
B <- 100
## define bechmarking design
bd <- initializeBenchDesign()
As described in simulations-null.Rmd
, we include Scott’s FDR Regression in the analysis for simulations with Gaussian or t-distributed test statistics. Again, we include both nulltype = "empirical"
and nulltype = "theoretical"
. Since all settings in this series of simulations use Gaussian test statistics, we include Scott’s FDR Regression in all of the comparisons.
bdplus <- bd
bdplus <- addBMethod(bdplus, "fdrreg-t",
FDRreg::FDRreg,
function(x) { x$FDR },
z = test_statistic,
features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
nulltype = 'theoretical',
control = list(lambda = 0.01))
bdplus <- addBMethod(bdplus, "fdrreg-e",
FDRreg::FDRreg,
function(x) { x$FDR },
z = test_statistic,
features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
nulltype = 'empirical',
control = list(lambda = 0.01))
All simulation settings will share the following parameters.
m <- 20000 # integer: number of hypothesis tests
pi0 <- pi0_cubic(0.90) # numeric: proportion of null hypotheses
ts_dist <- rnorm_perturber(1) # functional: sampling dist/noise for test stats
null_dist <- rnorm_2pvaluer(1) # functional: dist to calc p-values
icovariate <- runif # functional: independent covariate
Simulation results will be presented excluding a subset of methods, and for certain plots (upset plots), a single alpha cutoff will be used.
excludeSet <- c("unadjusted", "bl-df02", "bl-df04", "bl-df05")
ualpha <- 0.05
First, we consider the setting where the effect sizes under the alternative are distributed according to a “spiky” unimodal distribution centered around zero, as defined in the ASH simulations.
es_dist <- function(n) { 2*sampler_spiky(n) } # functional: dist of alternative test stats
seed <- 778
We next run the simulations.
if (file.exists(spiky_file)) {
res <- readRDS(spiky_file)
} else {
res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m,
pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
ts_dist = ts_dist, null_dist = null_dist,
seed = seed, mc.cores = cores)
saveRDS(res, file = spiky_file)
}
res_i <- lapply(res, `[[`, "informative")
res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
## Warning: Removed 9 rows containing non-finite values (stat_binhex).
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
We plot the averaged results across 100 replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(resdf, met="rejections", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="FDR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="TPR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="TNR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size")
covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha,
"; skipping upset plot")
}
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01))
resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
performanceMetric, alpha, value),
select(resdfu, rep, blabel, param.alpha, key,
performanceMetric, alpha, value),
by = c("rep", "blabel", "param.alpha", "key",
"performanceMetric", "alpha"),
suffix = c(".info", ".uninfo"))
resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo)
plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
Next, we consider the setting where the effect sizes under the alternative are distributed according to a “flat top” unimodal distribution centered around zero, as defined in the ASH simulations.
es_dist <- function(n) { 2*sampler_flat_top(n) } # functional: dist of alternative test stats
seed <- 980
We next run the simulations.
if (file.exists(flattop_file)) {
res <- readRDS(flattop_file)
} else {
res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m,
pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
ts_dist = ts_dist, null_dist = null_dist,
seed = seed, mc.cores = cores)
saveRDS(res, file = flattop_file)
}
res_i <- lapply(res, `[[`, "informative")
res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across 100 replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(resdf, met="rejections", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="FDR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="TPR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="TNR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size")
covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha,
"; skipping upset plot")
}
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01))
resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
performanceMetric, alpha, value),
select(resdfu, rep, blabel, param.alpha, key,
performanceMetric, alpha, value),
by = c("rep", "blabel", "param.alpha", "key",
"performanceMetric", "alpha"),
suffix = c(".info", ".uninfo"))
resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo)
plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
Next, we consider the setting where the effect sizes under the alternative are distributed according to a skewed unimodal distribution not centered at zero, as defined in the ASH simulations.
es_dist <- function(n) { 2*sampler_skew(n) } # functional: dist of alternative test stats
seed <- 206
We next run the simulations.
if (file.exists(skew_file)) {
res <- readRDS(skew_file)
} else {
res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m,
pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
ts_dist = ts_dist, null_dist = null_dist,
seed = seed, mc.cores = cores)
saveRDS(res, file = skew_file)
}
res_i <- lapply(res, `[[`, "informative")
res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
## Warning: Removed 99 rows containing non-finite values (stat_binhex).
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across 100 replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(resdf, met="rejections", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="FDR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="TPR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="TNR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size")
covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha,
"; skipping upset plot")
}
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01))
resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
performanceMetric, alpha, value),
select(resdfu, rep, blabel, param.alpha, key,
performanceMetric, alpha, value),
by = c("rep", "blabel", "param.alpha", "key",
"performanceMetric", "alpha"),
suffix = c(".info", ".uninfo"))
resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo)
plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
Finally, we consider the setting where the effect sizes under the alternative are distributed according to a bimodal distribution (equal mixture of two normal distributions centered at -2, 2, with variance 1), again, as defined in the ASH simulations.
es_dist <- function(n) { 2*sampler_bimodal(n) } # functional: dist of alternative test stats
seed <- 913
We next run the simulations.
if (file.exists(bimodal_file)) {
res <- readRDS(bimodal_file)
} else {
res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m,
pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
ts_dist = ts_dist, null_dist = null_dist,
seed = seed, mc.cores = cores)
saveRDS(res, file = bimodal_file)
}
res_i <- lapply(res, `[[`, "informative")
res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
## Warning: Removed 53 rows containing non-finite values (stat_binhex).
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across 100 replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(resdf, met="rejections", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="FDR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="TPR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
plotsim_average(resdf, met="TNR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size")
covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha,
"; skipping upset plot")
}
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01))
resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
performanceMetric, alpha, value),
select(resdfu, rep, blabel, param.alpha, key,
performanceMetric, alpha, value),
by = c("rep", "blabel", "param.alpha", "key",
"performanceMetric", "alpha"),
suffix = c(".info", ".uninfo"))
resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo)
plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] splines parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 cowplot_0.9.3
## [3] hexbin_1.27.2 adaptMT_1.0.0
## [5] FDRreg_0.1 fda_2.4.8
## [7] Matrix_1.2-14 fdrtool_1.2.15
## [9] swfdr_1.6.0 qvalue_2.12.0
## [11] ashr_2.2-7 IHW_1.8.0
## [13] magrittr_1.5 SummarizedBenchmark_0.99.2
## [15] mclust_5.4.1 stringr_1.3.1
## [17] rlang_0.2.2 UpSetR_1.3.3
## [19] SummarizedExperiment_1.10.1 DelayedArray_0.6.6
## [21] BiocParallel_1.14.2 matrixStats_0.54.0
## [23] Biobase_2.40.0 GenomicRanges_1.32.7
## [25] GenomeInfoDb_1.16.0 IRanges_2.14.12
## [27] S4Vectors_0.18.3 BiocGenerics_0.26.0
## [29] tidyr_0.8.1 ggplot2_3.0.0
## [31] dplyr_0.7.7
##
## loaded via a namespace (and not attached):
## [1] ggdendro_0.1-20 foreach_1.4.4 assertthat_0.2.0
## [4] ggstance_0.3.1 GenomeInfoDbData_1.1.0 ggrepel_0.8.0
## [7] mosaic_1.4.0 yaml_2.2.0 slam_0.1-43
## [10] pillar_1.3.0 backports_1.1.2 lattice_0.20-35
## [13] glue_1.3.0 digest_0.6.18 XVector_0.20.0
## [16] colorspace_1.3-2 htmltools_0.3.6 plyr_1.8.4
## [19] pkgconfig_2.0.2 broom_0.5.0 zlibbioc_1.26.0
## [22] purrr_0.2.5 scales_1.0.0 mosaicData_0.17.0
## [25] tibble_1.4.2 withr_2.1.2 lazyeval_0.2.1
## [28] crayon_1.3.4 evaluate_0.12 nlme_3.1-137
## [31] doParallel_1.0.14 MASS_7.3-51 truncnorm_1.0-8
## [34] tools_3.5.0 munsell_0.5.0 compiler_3.5.0
## [37] grid_3.5.0 RCurl_1.95-4.11 iterators_1.0.10
## [40] labeling_0.3 mosaicCore_0.6.0 bitops_1.0-6
## [43] rmarkdown_1.10 gtable_0.2.0 codetools_0.2-15
## [46] reshape2_1.4.3 R6_2.3.0 gridExtra_2.3
## [49] knitr_1.20 ggformula_0.9.0 bindr_0.1.1
## [52] rprojroot_1.3-2 lpsymphony_1.8.0 stringi_1.2.4
## [55] pscl_1.5.2 SQUAREM_2017.10-1 Rcpp_0.12.19
## [58] tidyselect_0.2.5