1 Summary

In this set of simulations, we consider settings with both null and non-null tests with varying distribution of effect sizes under the non-null (alternative) setting. Both informative and uninformative covariates are included in the setting as described in simulations-informative-cubic.Rmd. The effect sizes for non-null tests are sampled from unimodal distributions composed of a mixture of normal distributions, as described in the original adaptive shrinkage (ASH) manuscript (Stephens, 2016).

2 Workspace Setup

library(dplyr)
library(ggplot2)
library(SummarizedBenchmark)
library(parallel)

## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## project data/results folders
resdir <- "results"
dir.create(resdir, showWarnings = FALSE, recursive = TRUE)

## intermediary files we create below
spiky_file <- file.path(resdir, "uasettings-benchmark-spiky.rds")
flattop_file <- file.path(resdir, "uasettings-benchmark-flattop.rds")
skew_file <- file.path(resdir, "uasettings-benchmark-skew.rds")
bimodal_file <- file.path(resdir, "uasettings-benchmark-bimodal.rds")

## number of cores for parallelization
cores <- 20
B <- 100

## define bechmarking design
bd <- initializeBenchDesign()

As described in simulations-null.Rmd, we include Scott’s FDR Regression in the analysis for simulations with Gaussian or t-distributed test statistics. Again, we include both nulltype = "empirical" and nulltype = "theoretical". Since all settings in this series of simulations use Gaussian test statistics, we include Scott’s FDR Regression in all of the comparisons.

bdplus <- bd
bdplus <- addBMethod(bdplus, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
bdplus <- addBMethod(bdplus, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))

All simulation settings will share the following parameters.

m <- 20000                          # integer: number of hypothesis tests
pi0 <- pi0_cubic(0.90)              # numeric: proportion of null hypotheses
ts_dist <- rnorm_perturber(1)       # functional: sampling dist/noise for test stats
null_dist <- rnorm_2pvaluer(1)      # functional: dist to calc p-values
icovariate <- runif                 # functional: independent covariate

Simulation results will be presented excluding a subset of methods, and for certain plots (upset plots), a single alpha cutoff will be used.

excludeSet <- c("unadjusted", "bl-df02", "bl-df04", "bl-df05")
ualpha <- 0.05

3 Spiky Setting

First, we consider the setting where the effect sizes under the alternative are distributed according to a “spiky” unimodal distribution centered around zero, as defined in the ASH simulations.

3.1 Data Simulation

es_dist <- function(n) { 2*sampler_spiky(n) }       # functional: dist of alternative test stats
seed <- 778

We next run the simulations.

if (file.exists(spiky_file)) {
    res <- readRDS(spiky_file)
} else {
    res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m,
                    pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
                    ts_dist = ts_dist, null_dist = null_dist,
                    seed = seed, mc.cores = cores)
    saveRDS(res, file = spiky_file)
}
res_i <- lapply(res, `[[`, "informative")
res_u <- lapply(res, `[[`, "uninformative")

3.2 Covariate Diagnostics

Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.

onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
                       icovariate = icovariate, null_dist = null_dist, execute = FALSE)

rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")

## Warning: Removed 9 rows containing non-finite values (stat_binhex).

strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggplot2':
## 
##     ggsave

3.3 Benchmark Metrics

We plot the averaged results across 100 replications.

resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01))

plotsim_average(resdf, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.

covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size")

covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")

Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.

if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
    aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
    message("Not enough methods found rejections at alpha ", ualpha, 
            "; skipping upset plot")
}

We also compare the simulation results with and without an informative covariate.

resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01))

resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            select(resdfu, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            by = c("rep", "blabel", "param.alpha", "key",
                                   "performanceMetric", "alpha"),
                            suffix = c(".info", ".uninfo"))
resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo)

plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

4 Flat-Top Setting

Next, we consider the setting where the effect sizes under the alternative are distributed according to a “flat top” unimodal distribution centered around zero, as defined in the ASH simulations.

4.1 Data Simulation

es_dist <- function(n) { 2*sampler_flat_top(n) }       # functional: dist of alternative test stats
seed <- 980

We next run the simulations.

if (file.exists(flattop_file)) {
    res <- readRDS(flattop_file)
} else {
    res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m,
                    pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
                    ts_dist = ts_dist, null_dist = null_dist,
                    seed = seed, mc.cores = cores)
    saveRDS(res, file = flattop_file)
}
res_i <- lapply(res, `[[`, "informative")
res_u <- lapply(res, `[[`, "uninformative")

4.2 Covariate Diagnostics

Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.

onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
                       icovariate = icovariate, null_dist = null_dist, execute = FALSE)

rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")

strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)

4.3 Benchmark Metrics

We plot the averaged results across 100 replications.

resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01))

plotsim_average(resdf, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.

covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size")

covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")

Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.

if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
    aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
    message("Not enough methods found rejections at alpha ", ualpha, 
            "; skipping upset plot")
}

We also compare the simulation results with and without an informative covariate.

resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01))

resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            select(resdfu, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            by = c("rep", "blabel", "param.alpha", "key",
                                   "performanceMetric", "alpha"),
                            suffix = c(".info", ".uninfo"))
resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo)

plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

5 Skewed Setting

Next, we consider the setting where the effect sizes under the alternative are distributed according to a skewed unimodal distribution not centered at zero, as defined in the ASH simulations.

5.1 Data Simulation

es_dist <- function(n) { 2*sampler_skew(n) }       # functional: dist of alternative test stats
seed <- 206

We next run the simulations.

if (file.exists(skew_file)) {
    res <- readRDS(skew_file)
} else {
    res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m,
                    pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
                    ts_dist = ts_dist, null_dist = null_dist,
                    seed = seed, mc.cores = cores)
    saveRDS(res, file = skew_file)
}
res_i <- lapply(res, `[[`, "informative")
res_u <- lapply(res, `[[`, "uninformative")

5.2 Covariate Diagnostics

Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.

onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
                       icovariate = icovariate, null_dist = null_dist, execute = FALSE)

rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")

## Warning: Removed 99 rows containing non-finite values (stat_binhex).

strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)

5.3 Benchmark Metrics

We plot the averaged results across 100 replications.

resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01))

plotsim_average(resdf, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.

covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size")

covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")

Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.

if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
    aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
    message("Not enough methods found rejections at alpha ", ualpha, 
            "; skipping upset plot")
}

We also compare the simulation results with and without an informative covariate.

resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01))

resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            select(resdfu, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            by = c("rep", "blabel", "param.alpha", "key",
                                   "performanceMetric", "alpha"),
                            suffix = c(".info", ".uninfo"))
resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo)

plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

6 Bimodal Setting

Finally, we consider the setting where the effect sizes under the alternative are distributed according to a bimodal distribution (equal mixture of two normal distributions centered at -2, 2, with variance 1), again, as defined in the ASH simulations.

6.1 Data Simulation

es_dist <- function(n) { 2*sampler_bimodal(n) }        # functional: dist of alternative test stats
seed <- 913

We next run the simulations.

if (file.exists(bimodal_file)) {
    res <- readRDS(bimodal_file)
} else {
    res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m,
                    pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
                    ts_dist = ts_dist, null_dist = null_dist,
                    seed = seed, mc.cores = cores)
    saveRDS(res, file = bimodal_file)
}
res_i <- lapply(res, `[[`, "informative")
res_u <- lapply(res, `[[`, "uninformative")

6.2 Covariate Diagnostics

Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.

onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
                       icovariate = icovariate, null_dist = null_dist, execute = FALSE)

rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")

## Warning: Removed 53 rows containing non-finite values (stat_binhex).

strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)

6.3 Benchmark Metrics

We plot the averaged results across 100 replications.

resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01))

plotsim_average(resdf, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.

covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size")

covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")

Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.

if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
    aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
    message("Not enough methods found rejections at alpha ", ualpha, 
            "; skipping upset plot")
}

We also compare the simulation results with and without an informative covariate.

resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01))

resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            select(resdfu, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            by = c("rep", "blabel", "param.alpha", "key",
                                   "performanceMetric", "alpha"),
                            suffix = c(".info", ".uninfo"))
resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo)

plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)

7 Session Info

sessionInfo()

## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] splines   parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2              cowplot_0.9.3              
##  [3] hexbin_1.27.2               adaptMT_1.0.0              
##  [5] FDRreg_0.1                  fda_2.4.8                  
##  [7] Matrix_1.2-14               fdrtool_1.2.15             
##  [9] swfdr_1.6.0                 qvalue_2.12.0              
## [11] ashr_2.2-7                  IHW_1.8.0                  
## [13] magrittr_1.5                SummarizedBenchmark_0.99.2 
## [15] mclust_5.4.1                stringr_1.3.1              
## [17] rlang_0.2.2                 UpSetR_1.3.3               
## [19] SummarizedExperiment_1.10.1 DelayedArray_0.6.6         
## [21] BiocParallel_1.14.2         matrixStats_0.54.0         
## [23] Biobase_2.40.0              GenomicRanges_1.32.7       
## [25] GenomeInfoDb_1.16.0         IRanges_2.14.12            
## [27] S4Vectors_0.18.3            BiocGenerics_0.26.0        
## [29] tidyr_0.8.1                 ggplot2_3.0.0              
## [31] dplyr_0.7.7                
## 
## loaded via a namespace (and not attached):
##  [1] ggdendro_0.1-20        foreach_1.4.4          assertthat_0.2.0      
##  [4] ggstance_0.3.1         GenomeInfoDbData_1.1.0 ggrepel_0.8.0         
##  [7] mosaic_1.4.0           yaml_2.2.0             slam_0.1-43           
## [10] pillar_1.3.0           backports_1.1.2        lattice_0.20-35       
## [13] glue_1.3.0             digest_0.6.18          XVector_0.20.0        
## [16] colorspace_1.3-2       htmltools_0.3.6        plyr_1.8.4            
## [19] pkgconfig_2.0.2        broom_0.5.0            zlibbioc_1.26.0       
## [22] purrr_0.2.5            scales_1.0.0           mosaicData_0.17.0     
## [25] tibble_1.4.2           withr_2.1.2            lazyeval_0.2.1        
## [28] crayon_1.3.4           evaluate_0.12          nlme_3.1-137          
## [31] doParallel_1.0.14      MASS_7.3-51            truncnorm_1.0-8       
## [34] tools_3.5.0            munsell_0.5.0          compiler_3.5.0        
## [37] grid_3.5.0             RCurl_1.95-4.11        iterators_1.0.10      
## [40] labeling_0.3           mosaicCore_0.6.0       bitops_1.0-6          
## [43] rmarkdown_1.10         gtable_0.2.0           codetools_0.2-15      
## [46] reshape2_1.4.3         R6_2.3.0               gridExtra_2.3         
## [49] knitr_1.20             ggformula_0.9.0        bindr_0.1.1           
## [52] rprojroot_1.3-2        lpsymphony_1.8.0       stringi_1.2.4         
## [55] pscl_1.5.2             SQUAREM_2017.10-1      Rcpp_0.12.19          
## [58] tidyselect_0.2.5

Simulation Study: Unimodal Settings

Patrick Kimes

October 30, 2018

1 Summary

2 Workspace Setup

3 Spiky Setting

3.1 Data Simulation

3.2 Covariate Diagnostics

3.3 Benchmark Metrics

4 Flat-Top Setting

4.1 Data Simulation

4.2 Covariate Diagnostics

4.3 Benchmark Metrics

5 Skewed Setting

5.1 Data Simulation

5.2 Covariate Diagnostics

5.3 Benchmark Metrics

6 Bimodal Setting

6.1 Data Simulation

6.2 Covariate Diagnostics

6.3 Benchmark Metrics

7 Session Info