1 Summary

In this set of simulations, we consider settings with both null and non-null tests with an informative and uninformative covariate. The informative covariate is sampled uniformly from the interval [0, 1], and the conditional probability of a test being non-null is a smooth (cubic) function of the covariate. The uninformative covariate is simply uninformly sampled independently from the interval [0, 1]. The uninformative covariate is included as a baseline to compare the informative covariate against. We include simulation results again with Gaussian distributed test statistics.

Additionally, with the informative covariate, a dependence is introduced between the coviarate the p-value under the null distribution.

2 Workspace Setup

library(dplyr)
library(ggplot2)
library(parallel)
library(cowplot)

## load helper functions
for (f in list.files("../../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## new helper function
source("helpers-simulations-nulldependent.R")

## project data/results folders
resdir <- "results"
dir.create(resdir, showWarnings = FALSE, recursive = TRUE)

## intermediary files we create below
weak_file <- file.path(resdir, "nulldependent-cubic-benchmark-weaker.rds")
strong_file <- file.path(resdir, "nulldependent-cubic-benchmark-stronger.rds")

## number of cores for parallelization
cores <- 20
B <- 100

## define bechmarking design
bd <- initializeBenchDesign()

As described in simulations-null.Rmd, we include Scott’s FDR Regression in the analysis since all settings here use Gaussian test statistics. Again, we include both nulltype = "empirical" and nulltype = "theoretical".

bdplus <- bd
bdplus <- addBMethod(bdplus, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
bdplus <- addBMethod(bdplus, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))

All simulation settings will share the following parameters.

m <- 20000                        # integer: number of hypothesis tests
pi0 <- pi0_cubic(0.90)            # numeric: proportion of null hypotheses
icovariate <- runif               # functional: independent covariate

es_dist <- rnorm_generator(3)     # functional: dist of alternative test stats
ts_dist <- rnorm_perturber(1)     # functional: sampling dist/noise for test stats
null_dist <- rnorm_2pvaluer(1)    # functional: dist to calc p-values
null_inv <- function(p) { (2*rbinom(length(p), 1, .5) - 1) * qnorm(p / 2, 0, 1) }
seed <- 608

Simulation results will be presented excluding a subset of methods, and for certain plots (upset plots), a single alpha cutoff will be used.

excludeSet <- c("unadjusted", "bl-df02", "bl-df04", "bl-df05")
ualpha <- 0.05

3 Null Dependence

Here, we show null dependence on the variate by just showing the distribution of p-values when all tests are null (pi0 = 1). We show the distribution of p-values for two settings: weaker and stronger dependence between the covariate and p-values under the null.

3.1 Weaker Dependence

onerun_w <- simNullDependent(bdplus, m = 20000, pi0 = 1, es_dist = es_dist, ts_dist = ts_dist,
                             icovariate = icovariate, null_dist = null_dist, null_inv = null_inv,
                             null_dependence = 0.3, execute = FALSE)

rank_scatter(onerun_w, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Dist. of null p-values w/ weaker dependence")

strat_hist(onerun_w, pvalue = "pval", covariate = "ind_covariate", maxy = 7, numQ = 5) +
    ggtitle("Dist. of null p-values w/ weaker dependence")

## Warning: `list_len()` is soft-deprecated as of rlang 0.2.0.
## Please use `new_list()` instead
## This warning is displayed once per session.

3.2 Stronger Dependence

onerun_s <- simNullDependent(bdplus, m = 20000, pi0 = 1, es_dist = es_dist, ts_dist = ts_dist,
                             icovariate = icovariate, null_dist = null_dist, null_inv = null_inv,
                             null_dependence = 0.7, execute = FALSE)

rank_scatter(onerun_s, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Dist. of null p-values w/ stronger dependence")

strat_hist(onerun_s, pvalue = "pval", covariate = "ind_covariate", maxy = 7, numQ = 5) +
    ggtitle("Dist. of null p-values w/ stronger dependence")

3.3 Multipanel Plot

We also create the above plots as a multipanel plot.

gp_oW <- rank_scatter(onerun_w, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Dist. of null p-values w/ weaker dependence")
gp_oS <- rank_scatter(onerun_s, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Dist. of null p-values w/ stronger dependence")

gp_o <- plot_grid(gp_oW + expand_limits(y = c(0, 5)),
                  gp_oS + expand_limits(y = c(0, 5)),
                  labels = LETTERS[1:2], nrow = 1)
gp_o

We next run the simulations.

4 Weaker Dependence

First, we take a look at the results with weaker dependence.

if (file.exists(weak_file)) {
    res <- readRDS(weak_file)
} else {
    res <- mclapply(X = 1:B, FUN = simNullDependent, bench = bdplus, m = m,
                    pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
                    ts_dist = ts_dist, null_dist = null_dist, null_inv = null_inv,
                    null_dependence = 0.3,
                    seed = seed, mc.cores = cores)
    saveRDS(res, file = weak_file)
}

res_dep <- lapply(res, `[[`, "null_dependent")
res_indep <- lapply(res, `[[`, "null_independent")

4.1 Covariate Diagnostics

Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment. Unlike the plots above, these include both null and non-null tests.

onerun <- simNullDependent(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist,
                           icovariate = icovariate, null_dist = null_dist, null_inv = null_inv,
                           null_dependence = 0.3, execute = FALSE)

rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")

strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 7, numQ = 5)

4.2 Benchmark Metrics

We plot the averaged results across 100 replications.

resdf <- plotsim_standardize(res_dep, alpha = seq(0.01, 0.10, 0.01))

plotsim_average(resdf, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

plotsim_average(resdf, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE)

We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.

covariateLinePlot(res_dep, alpha = ualpha, covname = "effect_size")

## Warning: Removed 50 rows containing missing values (geom_path).

## Warning: Removed 50 rows containing missing values (geom_errorbar).

covariateLinePlot(res_dep, alpha = ualpha, covname = "ind_covariate")

## Warning: Removed 50 rows containing missing values (geom_path).

## Warning: Removed 50 rows containing missing values (geom_errorbar).

We also look at the FDR as a function of the independent covariate.

covariateLinePlot(res_dep, alpha = ualpha, covname = "ind_covariate", metric = "FDR")

## Warning: Removed 61 rows containing missing values (geom_path).

## Warning: Removed 70 rows containing missing values (geom_errorbar).

Finally, (if enough methods produce rejections at 0.05) we take a look at the overlap of rejections between methods.

if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) {
    aggupset(res_dep, alpha = ualpha, supplementary = FALSE, return_list = FALSE)
} else {
    message("Not enough methods found rejections at alpha ", ualpha, 
            "; skipping upset plot")
}

We also compare the simulation results with and without dependence between the covariate under the nul.

resdfu <- plotsim_standardize(res_indep, alpha = seq(0.01, 0.10, 0.01))

resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            select(resdfu, rep, blabel, param.alpha, key,
                                   performanceMetric, alpha, value),
                            by = c("rep", "blabel", "param.alpha", "key",
                                   "performanceMetric", "alpha"),
                            suffix = c(".dep", ".indep"))
resdfiu <- dplyr::mutate(resdfiu, value = value.dep - value.indep)

plotsim_average(resdfiu, met="rejections", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) +
    ylab("rejections (dep - indep)")

plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) +
    scale_y_continuous("FDR (dep - indep)", labels = scales::percent)

## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) +
    scale_y_continuous("TPR (dep - indep)", labels = scales::percent)

## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

plotsim_average(resdfiu, met="TNR", filter_set = excludeSet,
                merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) +
    scale_y_continuous("TNR (dep - indep)", labels = scales::percent)

## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

4.3 Multipanel Plot

Here, we create a multipanel plot to consolidate the results to a single figure.

gp_wA <- plotsim_average(resdf, met="FDR", filter_set = excludeSet,
                         merge_ihw = TRUE, errorBars = TRUE) 
gp_wB <- plotsim_average(resdf, met="TPR", filter_set = excludeSet,
                         merge_ihw = TRUE, errorBars = TRUE) 
gp_wC <- plotsim_average(resdfiu, met="FDR", filter_set = excludeSet,
                         merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) +
    scale_y_continuous("FDR (dep - indep)", labels = scales::percent)

## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

gp_wD <- plotsim_average(resdfiu, met="TPR", filter_set = excludeSet,
                         merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) +
    scale_y_continuous("TPR (dep - indep)", labels = scales::percent)

## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

gp_w <- plot_grid(gp_wA + theme(legend.position = "none"),
                  gp_wB + theme(legend.position = "none"),
                  gp_wC + theme(legend.position = "none"),
                  gp_wD + theme(legend.position = "none"),
                  labels = LETTERS[1:4], nrow = 2, ncol = 2)
gp_w <- plot_grid(gp_w, get_legend(gp_wA),
                  rel_widths = c(1, .2))
gp_w

5 Stronger Dependence

Next, we take a look at the results with stronger dependence.

if (file.exists(strong_file)) {
    res <- readRDS(strong_file)
} else {
    res <- mclapply(X = 1:B, FUN = simNullDependent, bench = bdplus, m = m,
                    pi0 = pi0, es_dist = es_dist, icovariate = icovariate,
                    ts_dist = ts_dist, null_dist = null_dist, null_inv = null_inv,
                    null_dependence = 0.7,
                    seed = seed, mc.cores = cores)
    saveRDS(res, file = strong_file)
}

res_dep <- lapply(res, `[[`, "null_dependent")
res_indep <- lapply(res, `[[`, "null_independent")

5.1 Covariate Diagnostics