This is an analysis using polyester Bioconductor package to simulate RNA-Seq reads. We will
(1) implement null comparisons on samples simulated using polyester with no differentially expressed (DE) genes, (2) compare FDR approaches on their ability to discover ‘true positive’ DE genes with log2 fold changes simulated from a normal distribution.
In this Rmd, we will perform a Monte Carlo simulation study and average over the replicates in plots.
# Load packages needed to simulate RNA-seq counts with polyester
library(SummarizedBenchmark)
library(genefilter)
library(limma)
library(polyester)
library(DESeq2)
library(data.table)
library(magrittr)
library(dplyr)
library(ggplot2)
library(cowplot)
library(tibble)
library(ggthemes)
library(readr)
# Source every helper script shipped in ../R (both .r and .R extensions).
helper_scripts <- list.files("../R", "\\.(r|R)$", full.names = TRUE)
invisible(lapply(helper_scripts, source))
# Set up the data and results directories (no-op if they already exist).
datdir <- "yeast-data"
resdir <- "../../results/RNAseq"
for (d in c(datdir, resdir)) {
  dir.create(d, showWarnings = FALSE)
}
# results files that will be generated below (cached so reruns skip simulation)
# null comparisons (no DE genes)
resfile_n5 <- file.path(resdir, "polyester-results-null5.rds")
resfile_n10 <- file.path(resdir, "polyester-results-null10.rds")
# DE with a strongly informative covariate
resfile_d5 <- file.path(resdir, "polyester-results-de5.rds")
resfile_d10 <- file.path(resdir, "polyester-results-de10.rds")
# DE with a weakly informative covariate
resfile_d5_w <- file.path(resdir, "polyesterW-results-de5.rds")
resfile_d10_w <- file.path(resdir, "polyesterW-results-de10.rds")
# DE with an uninformative covariate
resfile_d5_uninfCov <- file.path(resdir, "polyester-results-de5-uninfCov.rds")
resfile_d10_uninfCov <- file.path(resdir, "polyester-results-de10-uninfCov.rds")
# set up parallel backend: mclapply() with nCores workers
library(parallel)
nCores <- 20
We use the polyester Bioconductor package to simulate RNA-Seq reads. The package requires a count matrix as input, from which it estimates the mean–variance relationship. Here we start with the yeast data with 48 biological replicates in each of two conditions (analyzed in this publication). We’ll use the samples passing QC in the original study that belong to the WT group to estimate the mean–variance relationship. We assume the data has already been downloaded (this is carried out in the yeast-simulation.Rmd vignette).
# Read the per-sample count files and assemble a normalized count matrix of
# WT samples that passed QC in the original study.
# NOTE(review): "*.bam.gbgout" is used as a regex here, not a glob; it works
# incidentally because "*" repeats the preceding char — confirm intent.
files <- dir(path = datdir, pattern = "*.bam.gbgout", full.names = TRUE)
sample_names <- sapply(strsplit(dir(path = datdir, pattern = "*.bam.gbgout"), "_MID"),
function(x) x[[1]])
# samples flagged as failing QC in the original study
badreps <- read_tsv(file.path(datdir, "badreps.txt"), col_names = FALSE)$X1
badreps <- unlist(lapply(strsplit(badreps, "_MID"), function(x) x[1]))
counts <- files %>%
purrr::map(read_tsv, col_names = FALSE) %>% # read in all the files individually
purrr::map2(sample_names, ~ dplyr::rename(.x, !! .y := X2, feature = X1) ) %>% # add sample names (rename is exported; ::: was unnecessary)
purrr::reduce(left_join, by = "feature") %>% # join all samples by feature into one dataframe
dplyr::select(-badreps ) %>% # remove badreps
data.frame() %>%
dplyr::select(contains("WT")) %>% # keep only the WT group
as.matrix
# filter low count genes
counts <- counts[rowMeans(counts) > 1,]
# normalize library sizes using DESeq2 size factors
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = tibble(sample=colnames(counts)),
design = ~1)
dds <- estimateSizeFactors(dds)
counts <- counts(dds, normalized=TRUE) # spell out 'normalized' (was partial-matched as 'normalize')
## Estimate the zero inflated negative binomial parameters for polyester
paramPolyester <- get_params(counts)
Next, we’ll simulate RNA-Seq samples, both with and without the addition of simulated DE genes. Here we’ll create a function that we can use to run one replicate given sample size and number of DE gene settings. This will be looped over many replications and results averaged over them.
#' Simulate one replicate of a polyester RNA-seq experiment and benchmark
#' the FDR-controlling methods on it.
#'
#' Relies on the globals `counts` (normalized count matrix) and
#' `paramPolyester` (polyester ZINB parameters) defined above.
#'
#' @param X replicate index; combined with rseed to seed the RNG
#' @param rseed random seed multiplier (seed used is X * rseed)
#' @param nDE is the number of DE genes
#' @param sampleSize is the number of samples in each condition
#' @param bd is the bench design object
#' @param uninformativeCovariate logical indicating whether to use an uninformative
#' covariate (default is FALSE)
#' @param ngenes the number of genes to simulate. If this number is greater than
#' the number of genes in the count matrix, then the polyester parameters will be
#' drawn with replacement from the available pool in order to obtain the requested
#' number of genes.
#' @param strongCovariate logical indicating whether to use a strongly informative
#' covariate (default is TRUE)
#'
#' @return a SummarizedBenchmark object with default metrics added and the
#' simulated log2 fold changes stored in rowData
simulateOneSplit <- function(X, rseed,
nDE, sampleSize, bd, uninformativeCovariate = FALSE,
ngenes = nrow(counts),
strongCovariate = TRUE){
# set random seed for this replicate
# NOTE(review): X * rseed can collide for different (X, rseed) pairs;
# kept as-is to preserve the existing simulation streams.
set.seed(as.numeric(X)*as.numeric(rseed))
# informative covariate in (0, 1), one value per *simulated* gene.
# Previously drawn with length nrow(counts), which misaligned the
# covariate whenever ngenes != nrow(counts); for the default
# ngenes = nrow(counts) the RNG stream is unchanged.
covar <- 1 / (1+exp(-runif(ngenes, 0, 10) + 5))
# things needed to simulate with Polyester
groupPolyester = rep(c(0,1),each=sampleSize)
modPolyester = model.matrix(~-1 + groupPolyester)
# things needed for differential testing using DESeq2
groupDESeq = factor(rep(c(0,1), each=sampleSize))
# sample genes; only sample with replacement when more genes are requested
# than are available in the count matrix
gs <- sample(seq_len(nrow(counts)), ngenes, replace = ngenes > nrow(counts))
mu <- paramPolyester$mu[gs]
p0 <- paramPolyester$p0[gs]
# Null genes: start with zero log2 fold change for every gene
log2FC = rep(0, ngenes)
dat_alt = create_read_numbers(mu,
paramPolyester$fit,
p0,
n=sampleSize*2, beta=cbind(log2FC),
mod=modPolyester)
# filter genes with mostly zeroes (50% or more zero counts)
pzero = rowSums(dat_alt==0)/ncol(dat_alt)
dat_alt <- dat_alt[pzero < 0.5,]
truth <- rep(FALSE, nrow(dat_alt))
log2FC <- log2FC[pzero < 0.5]
mu <- mu[pzero < 0.5]
p0 <- p0[pzero < 0.5]
covar <- covar[pzero < 0.5]
# DE simulation: spike in nDE genes, sampled preferentially where covar is large
if(nDE > 0){
DE <- sample(seq_len(nrow(dat_alt)), nDE, prob = covar)
truth[DE] <- TRUE
# randomly sample a log2FC from N(0,1)
log2FC[DE] <- rnorm(nDE, sd = 1)
# polyester assumes FC is on the (natural) log scale
dat_alt[DE,] = create_read_numbers(mu[DE], paramPolyester$fit, p0[DE],
n=sampleSize*2, beta=cbind(log(2^log2FC[DE])),
mod=modPolyester)
}
# run DESeq2 two-group comparison on the simulated counts
dds_alt <- DESeqDataSetFromMatrix(countData=dat_alt,
colData = DataFrame(groupDESeq),
design=~groupDESeq)
dds_alt <- DESeq(dds_alt, parallel = FALSE)
res_alt <- results(dds_alt, independentFiltering = FALSE)
# assemble per-gene results; the truth indicator is stored in the "qvalue"
# column because buildBench() below is given truthCols = "qvalue"
geneExp <- tbl_df(data.frame(pval=res_alt$pvalue, SE=res_alt$lfcSE,
ind_covariate = covar,
effect_size=res_alt$log2FoldChange,
test_statistic=res_alt$stat, qvalue=truth))
if (uninformativeCovariate){
# replace the covariate with pure noise
geneExp <- mutate(geneExp, ind_covariate = runif(length(covar)))
}else if(!strongCovariate){
# weaken the covariate by adding noise, truncated to stay in [0, 1]
geneExp <- mutate(geneExp, ind_covariate = pmin(1, abs(covar + rnorm(length(covar), 0, 0.25))))
}
# filter NAs (e.g. p-values set to NA by DESeq2 outlier handling)
geneExp <- geneExp %>% na.omit()
# build Benchmark data and run every method in the bench design
sb <- bd %>% buildBench(data=geneExp, parallel = FALSE,
truthCols = "qvalue",
ftCols = "ind_covariate")
sb <- addDefaultMetrics(sb)
rowData(sb)$log2FC <- geneExp$effect_size
return(sb)
}
We’ll also set some parameters that will be common to all simulations. These include the number of replications, the bench design object, the set of methods to exclude in the results plots, and the alpha cutoff level to be used when plotting the aggregated Upset results.
B <- 100 # number of Monte Carlo replicates per simulation setting
bd <- initializeBenchDesign() # only needs to be done once
##
## Attaching package: 'IHW'
## The following object is masked from 'package:ggplot2':
##
## alpha
##
## Attaching package: 'ashr'
## The following object is masked from 'package:mclust':
##
## dens
## Loading required package: fda
## Loading required package: splines
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
##
## matplot
## Loading required package: BayesLogit
## Loading required package: mvtnorm
# methods to exclude from the results plots (unadjusted p-values and
# the extra bl spline-df settings)
excludeSet <- c("unadjusted", "bl-df02", "bl-df04", "bl-df05")
# alpha cutoff used when plotting the aggregated upset results
ualpha <- 0.05
We also add in Scott’s FDR regression (both nulltype = "empirical" and nulltype = "theoretical"), since our test statistics are approximately t-distributed.
# Scott's FDR regression with a theoretical null; the covariate enters
# through a B-spline basis (df = 3)
bd <- addBMethod(bd, "fdrreg-t",
FDRreg::FDRreg,
function(x) { x$FDR },
z = test_statistic,
features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
nulltype = 'theoretical',
control = list(lambda = 0.01))
# same method, but with an empirically estimated null
bd <- addBMethod(bd, "fdrreg-e",
FDRreg::FDRreg,
function(x) { x$FDR },
z = test_statistic,
features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
nulltype = 'empirical',
control = list(lambda = 0.01))
Here’s a helper function to return the number of methods with rejections at a particular alpha level (this helps us determine whether or not to plot the aggregated upset plot - if there aren’t at least 2 methods it will throw an error, which is a problem for the null simulations).
# To be included in the upset agg plot, a method needs to have found on average
# more than one rejection per replicate (strict "> 1" below). To create an
# upset plot, require that at least two methods rejected at this threshold.
#' @param res standardized metric data.table generated using
#' standardize_results.
#' @param alphacutoff alpha cutoff
#' @param filterSet which methods to exclude from consideration
#' @return number of remaining methods whose mean rejection count across
#' replicates exceeds 1
numberMethodsReject <- function(res, alphacutoff, filterSet){
res <- res %>%
# param.alpha is NA for methods without their own alpha parameter
filter(is.na(param.alpha) | (param.alpha == alphacutoff)) %>%
filter(!(blabel %in% filterSet)) %>%
filter(alpha == alphacutoff) %>%
filter(performanceMetric == "rejections") %>%
select(blabel, performanceMetric, value) %>%
group_by(blabel) %>%
summarize(mean_value = mean(value)) %>%
filter(mean_value > 1)
return(nrow(res))
}
Here we run a null comparison 5 versus 5 RNA-Seq samples. This will be done for 100 simulations.
# Null comparison, 5 vs 5 samples, no DE genes
sampleSize <- 5
nDE <- 0
rseed <- 225
# run B replicates in parallel, or load cached results if present
if (!file.exists(resfile_n5)){
null5 <- mclapply(X=1:B, rseed=rseed, FUN=simulateOneSplit,
nDE=nDE, sampleSize=sampleSize, bd=bd, mc.cores = nCores)
saveRDS(null5, file=resfile_n5)
} else {
null5 <- readRDS(file=resfile_n5)
}
Plot results.
# Check for missing results (if any methods threw an error for relevant metrics).
# Per method, counts the replicates containing any NA qvalue (0 = all clean).
rowSums(sapply(null5, function(x) colSums(is.na(assays(x)$qvalue)) > 0))
## unadjusted bonf bh qvalue ihw-a01 ihw-a02
## 0 0 0 0 0 0
## ihw-a03 ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08
## 0 0 0 0 0 0
## ihw-a09 ihw-a10 ashq bl-df02 bl-df03 bl-df04
## 0 0 0 0 0 0
## bl-df05 lfdr adapt-glm fdrreg-t fdrreg-e
## 0 0 0 0 0
# standardize the results across methods over a grid of alpha values
res5 <- plotsim_standardize(null5, alpha = seq(0.01, 0.10, 0.01))
# available metrics: "TPR" "FPR" "TNR" "FNR" "rejections" "FWER" "rejectprop"
plotsim_average(res5, met="rejections",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res5, met="TNR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
# rejections across bins of the simulated log2FC and of the covariate
covariateLinePlot(null5, alpha=0.05, covname="log2FC", nbins=25,
trans="log1p")
covariateLinePlot(null5, alpha=0.05, covname="ind_covariate", nbins=25,
trans="log1p")
# aggregated upset plot needs at least 2 methods with rejections
if (numberMethodsReject(res5, alphacutoff=ualpha, filterSet=excludeSet) >= 2){
aggupset(null5, alpha=ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha,
"; skipping upset plot")
}
## Not enough methods found rejections at alpha 0.05; skipping upset plot
Here we’ll repeat the above, but for a null comparison 10 versus 10 samples. This will be done for 100 simulations.
# Null comparison, 10 vs 10 samples, no DE genes
sampleSize <- 10
nDE <- 0
rseed <- 837
# run B replicates in parallel, or load cached results if present
if (!file.exists(resfile_n10)){
# pass rseed via the variable (was hard-coded as 837) for consistency
# with the other simulation chunks; the value is unchanged
null10 <- mclapply(X=1:B, rseed=rseed, FUN=simulateOneSplit,
nDE=nDE, sampleSize=sampleSize, bd=bd, mc.cores=nCores)
saveRDS(null10, file=resfile_n10)
} else {
null10 <- readRDS(file=resfile_n10)
}
Plot results.
# Check for missing results (if any methods threw an error for relevant metrics).
# Per method, counts the replicates containing any NA qvalue (0 = all clean).
rowSums(sapply(null10, function(x) colSums(is.na(assays(x)$qvalue)) > 0))
## unadjusted bonf bh qvalue ihw-a01 ihw-a02
## 0 0 0 0 0 0
## ihw-a03 ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08
## 0 0 0 0 0 0
## ihw-a09 ihw-a10 ashq bl-df02 bl-df03 bl-df04
## 0 0 0 0 0 0
## bl-df05 lfdr adapt-glm fdrreg-t fdrreg-e
## 0 0 0 0 0
# standardize the results across methods over a grid of alpha values
res10 <- plotsim_standardize(null10, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(res10, met="rejections",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res10, met="TNR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
# rejections across bins of the simulated log2FC and of the covariate
covariateLinePlot(null10, alpha=0.05, covname="log2FC", nbins=25,
trans="log1p")
covariateLinePlot(null10, alpha=0.05, covname="ind_covariate", nbins=25,
trans="log1p")
# aggregated upset plot needs at least 2 methods with rejections
if (numberMethodsReject(res10, alphacutoff=ualpha, filterSet=excludeSet) >= 2){
aggupset(null10, alpha=ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot")
}
## Not enough methods found rejections at alpha 0.05; skipping upset plot
Here we’ll repeat the above, but for a DE comparison 5 versus 5 samples and 2000 DE genes are added. This will be done for 100 simulations.
# DE comparison, 5 vs 5 samples, 2000 DE genes, strong covariate
sampleSize <- 5
nDE <- 2000
rseed <- 198
# run B replicates in parallel, or load cached results if present
if (!file.exists(resfile_d5)){
de5 <- mclapply(X=1:B, rseed = rseed, FUN=simulateOneSplit,
nDE=nDE, sampleSize=sampleSize, bd=bd, mc.cores=nCores)
saveRDS(de5, file=resfile_d5)
} else {
de5 <- readRDS(file=resfile_d5)
}
Plot results.
# Check for missing results (if any methods threw an error for relevant metrics).
# Per method, counts the replicates containing any NA qvalue (0 = all clean).
rowSums(sapply(de5, function(x) colSums(is.na(assays(x)$qvalue)) > 0))
## unadjusted bonf bh qvalue ihw-a01 ihw-a02
## 0 0 0 0 0 0
## ihw-a03 ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08
## 0 0 0 0 0 0
## ihw-a09 ihw-a10 ashq bl-df02 bl-df03 bl-df04
## 0 0 0 0 0 0
## bl-df05 lfdr adapt-glm fdrreg-t fdrreg-e
## 0 0 0 0 0
# standardize the results across methods over a grid of alpha values
res5d <- plotsim_standardize(de5, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(res5d, met="rejections",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res5d, met="FDR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res5d, met="TPR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
# joint FPR/TPR panel (no error bars)
plotsim_average(res5d, met=c("FPR", "TPR"), filter_set = excludeSet,
merge_ihw = TRUE)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
# rejections across bins of the simulated log2FC and of the covariate
covariateLinePlot(de5, alpha=0.05, covname="log2FC", nbins=25,
trans="log1p")
covariateLinePlot(de5, alpha=0.05, covname="ind_covariate", nbins=25,
trans="log1p")
# aggregated upset plot needs at least 2 methods with rejections
if (numberMethodsReject(res5d, alphacutoff=ualpha, filterSet=excludeSet) >= 2){
aggupset(de5, alpha=ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot")
}
Here we’ll repeat the above, but for a DE comparison 10 versus 10 samples and 2000 DE genes are added. This will be done for 100 simulations.
# DE comparison, 10 vs 10 samples, 2000 DE genes, strong covariate
sampleSize <- 10
nDE <- 2000
rseed <- 961
# run B replicates in parallel, or load cached results if present
if (!file.exists(resfile_d10)){
de10 <- mclapply(X=1:B, rseed = rseed, FUN=simulateOneSplit,
nDE=nDE, sampleSize=sampleSize, bd=bd, mc.cores = nCores)
saveRDS(de10, file=resfile_d10)
} else {
de10 <- readRDS(file=resfile_d10)
}
Plot results.
# Check for missing results (if any methods threw an error for relevant metrics).
# Per method, counts the replicates containing any NA qvalue (0 = all clean).
rowSums(sapply(de10, function(x) colSums(is.na(assays(x)$qvalue)) > 0))
## unadjusted bonf bh qvalue ihw-a01 ihw-a02
## 0 0 0 0 0 0
## ihw-a03 ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08
## 0 0 0 0 0 0
## ihw-a09 ihw-a10 ashq bl-df02 bl-df03 bl-df04
## 0 0 0 0 0 0
## bl-df05 lfdr adapt-glm fdrreg-t fdrreg-e
## 0 0 0 0 0
# standardize the results across methods over a grid of alpha values
res10d <- plotsim_standardize(de10, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(res10d, met="rejections",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res10d, met="FDR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res10d, met="TPR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
# joint FPR/TPR panel (no error bars)
plotsim_average(res10d, met=c("FPR", "TPR"), filter_set = excludeSet,
merge_ihw = TRUE)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
# rejections across bins of the simulated log2FC and of the covariate
covariateLinePlot(de10, alpha=0.05, covname="log2FC", nbins=25,
trans="log1p")
covariateLinePlot(de10, alpha=0.05, covname="ind_covariate", nbins=25,
trans="log1p")
# aggregated upset plot needs at least 2 methods with rejections
if (numberMethodsReject(res10d, alphacutoff=ualpha, filterSet=excludeSet) >= 2){
aggupset(de10, alpha=ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot")
}
Here we’ll repeat the above, but using a weakly informative covariate.
# DE comparison, 5 vs 5 samples, 2000 DE genes, weakly informative covariate
sampleSize <- 5
nDE <- 2000
rseed <- 198
# run B replicates in parallel, or load cached results if present
if (!file.exists(resfile_d5_w)){
de5 <- mclapply(X=1:B, rseed = rseed, FUN=simulateOneSplit,
nDE=nDE, sampleSize=sampleSize, bd=bd,
strongCovariate = FALSE, mc.cores=nCores)
saveRDS(de5, file=resfile_d5_w)
} else {
de5 <- readRDS(file=resfile_d5_w)
}
Plot results.
# Check for missing results (if any methods threw an error for relevant metrics).
# Per method, counts the replicates containing any NA qvalue (0 = all clean).
rowSums(sapply(de5, function(x) colSums(is.na(assays(x)$qvalue)) > 0))
## unadjusted bonf bh qvalue ihw-a01 ihw-a02
## 0 0 0 0 0 0
## ihw-a03 ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08
## 0 0 0 0 0 0
## ihw-a09 ihw-a10 ashq bl-df02 bl-df03 bl-df04
## 0 0 0 0 0 0
## bl-df05 lfdr adapt-glm fdrreg-t fdrreg-e
## 0 0 0 0 0
# standardize the results across methods over a grid of alpha values
res5d <- plotsim_standardize(de5, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(res5d, met="rejections",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res5d, met="FDR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res5d, met="TPR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
# joint FPR/TPR panel (no error bars)
plotsim_average(res5d, met=c("FPR", "TPR"), filter_set = excludeSet,
merge_ihw = TRUE)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
# rejections across bins of the simulated log2FC and of the covariate
covariateLinePlot(de5, alpha=0.05, covname="log2FC", nbins=25,
trans="log1p")
covariateLinePlot(de5, alpha=0.05, covname="ind_covariate", nbins=25,
trans="log1p")
# aggregated upset plot needs at least 2 methods with rejections
if (numberMethodsReject(res5d, alphacutoff=ualpha, filterSet=excludeSet) >= 2){
aggupset(de5, alpha=ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot")
}
Here we’ll repeat the above, but for a DE comparison 10 versus 10 samples.
# DE comparison, 10 vs 10 samples, 2000 DE genes, weakly informative covariate
sampleSize <- 10
nDE <- 2000
rseed <- 961
# run B replicates in parallel, or load cached results if present
if (!file.exists(resfile_d10_w)){
de10 <- mclapply(X=1:B, rseed = rseed, FUN=simulateOneSplit,
nDE=nDE, sampleSize=sampleSize, bd=bd,
strongCovariate = FALSE,
mc.cores = nCores)
saveRDS(de10, file=resfile_d10_w)
} else {
de10 <- readRDS(file=resfile_d10_w)
}
Plot results.
# Check for missing results (if any methods threw an error for relevant metrics).
# Per method, counts the replicates containing any NA qvalue (0 = all clean).
rowSums(sapply(de10, function(x) colSums(is.na(assays(x)$qvalue)) > 0))
## unadjusted bonf bh qvalue ihw-a01 ihw-a02
## 0 0 0 0 0 0
## ihw-a03 ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08
## 0 0 0 0 0 0
## ihw-a09 ihw-a10 ashq bl-df02 bl-df03 bl-df04
## 0 0 0 0 0 0
## bl-df05 lfdr adapt-glm fdrreg-t fdrreg-e
## 0 0 0 0 0
# standardize the results across methods over a grid of alpha values
res10d <- plotsim_standardize(de10, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(res10d, met="rejections",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res10d, met="FDR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res10d, met="TPR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
# joint FPR/TPR panel (no error bars)
plotsim_average(res10d, met=c("FPR", "TPR"), filter_set = excludeSet,
merge_ihw = TRUE)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
# rejections across bins of the simulated log2FC and of the covariate
covariateLinePlot(de10, alpha=0.05, covname="log2FC", nbins=25,
trans="log1p")
covariateLinePlot(de10, alpha=0.05, covname="ind_covariate", nbins=25,
trans="log1p")
# aggregated upset plot needs at least 2 methods with rejections
if (numberMethodsReject(res10d, alphacutoff=ualpha, filterSet=excludeSet) >= 2){
aggupset(de10, alpha=ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot")
}
Here we’ll repeat the above, but using a completely uninformative covariate.
# DE comparison, 5 vs 5 samples, 2000 DE genes, uninformative covariate
sampleSize <- 5
nDE <- 2000
rseed <- 198
# run B replicates in parallel, or load cached results if present
if (!file.exists(resfile_d5_uninfCov)){
de5 <- mclapply(X=1:B, rseed=rseed, FUN=simulateOneSplit,
nDE=nDE, sampleSize=sampleSize,
bd=bd, uninformativeCovariate = TRUE, mc.cores=nCores)
saveRDS(de5, file=resfile_d5_uninfCov)
} else {
de5 <- readRDS(file=resfile_d5_uninfCov)
}
Plot results.
# Check for missing results (if any methods threw an error for relevant metrics).
# Per method, counts the replicates containing any NA qvalue (0 = all clean).
rowSums(sapply(de5, function(x) colSums(is.na(assays(x)$qvalue)) > 0))
## unadjusted bonf bh qvalue ihw-a01 ihw-a02
## 0 0 0 0 0 0
## ihw-a03 ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08
## 0 0 0 0 0 0
## ihw-a09 ihw-a10 ashq bl-df02 bl-df03 bl-df04
## 0 0 0 0 0 0
## bl-df05 lfdr adapt-glm fdrreg-t fdrreg-e
## 0 0 0 0 0
# standardize the results across methods over a grid of alpha values
res5d <- plotsim_standardize(de5, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(res5d, met="rejections",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res5d, met="FDR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res5d, met="TPR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
# joint FPR/TPR panel (no error bars)
plotsim_average(res5d, met=c("FPR", "TPR"), filter_set = excludeSet,
merge_ihw = TRUE)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
# rejections across bins of the simulated log2FC and of the covariate
covariateLinePlot(de5, alpha=0.05, covname="log2FC", nbins=25,
trans="log1p")
covariateLinePlot(de5, alpha=0.05, covname="ind_covariate", nbins=25,
trans="log1p")
# aggregated upset plot needs at least 2 methods with rejections
if (numberMethodsReject(res5d, alphacutoff=ualpha, filterSet=excludeSet) >= 2){
aggupset(de5, alpha=ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot")
}
Here we’ll repeat the above, but for a DE comparison 10 versus 10 samples.
# DE comparison, 10 vs 10 samples, 2000 DE genes, uninformative covariate
sampleSize <- 10
nDE <- 2000
rseed <- 961
# run B replicates in parallel, or load cached results if present
if (!file.exists(resfile_d10_uninfCov)){
de10 <- mclapply(X=1:B, rseed=rseed, FUN=simulateOneSplit,
nDE=nDE, sampleSize=sampleSize,
bd=bd, uninformativeCovariate = TRUE, mc.cores = nCores)
saveRDS(de10, file=resfile_d10_uninfCov)
} else {
de10 <- readRDS(file=resfile_d10_uninfCov)
}
Plot results.
# Check for missing results (if any methods threw an error for relevant metrics).
# Per method, counts the replicates containing any NA qvalue (0 = all clean).
rowSums(sapply(de10, function(x) colSums(is.na(assays(x)$qvalue)) > 0))
## unadjusted bonf bh qvalue ihw-a01 ihw-a02
## 0 0 0 0 0 0
## ihw-a03 ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08
## 0 0 0 0 0 0
## ihw-a09 ihw-a10 ashq bl-df02 bl-df03 bl-df04
## 0 0 0 0 0 0
## bl-df05 lfdr adapt-glm fdrreg-t fdrreg-e
## 0 0 0 0 0
# standardize the results across methods over a grid of alpha values
res10d <- plotsim_standardize(de10, alpha = seq(0.01, 0.10, 0.01))
plotsim_average(res10d, met="rejections",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res10d, met="FDR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
plotsim_average(res10d, met="TPR",filter_set = excludeSet,
merge_ihw = TRUE, errorBars=TRUE)
# joint FPR/TPR panel (no error bars)
plotsim_average(res10d, met=c("FPR", "TPR"), filter_set = excludeSet,
merge_ihw = TRUE)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
# rejections across bins of the simulated log2FC and of the covariate
covariateLinePlot(de10, alpha=0.05, covname="log2FC", nbins=25,
trans="log1p")
covariateLinePlot(de10, alpha=0.05, covname="ind_covariate", nbins=25,
trans="log1p")
# aggregated upset plot needs at least 2 methods with rejections
if (numberMethodsReject(res10d, alphacutoff=ualpha, filterSet=excludeSet) >= 2){
aggupset(de10, alpha=ualpha, supplementary = FALSE, return_list = FALSE)
} else {
message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot")
}
Here we compare the method ranks for the different sample sizes and informative covariate settings at alpha = 0.10.
# Compare method ranks across sample sizes and covariate informativeness
# (S = strong, W = weak, U = uninformative) at alpha = 0.10, reading the
# cached result files generated above.
plotMethodRanks(c(resfile_d5, resfile_d5_w, resfile_d5_uninfCov,
resfile_d10, resfile_d10_w, resfile_d10_uninfCov),
colLabels = c("DE5 S", "DE5 W", "DE5 U", "DE10 S", "DE10 W", "DE10 U"),
alpha = 0.10, xlab = "Comparison",
excludeMethods = NULL)
# record package versions for reproducibility
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] splines parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 adaptMT_1.0.0
## [3] FDRreg_0.2-1 mvtnorm_1.0-8
## [5] BayesLogit_0.6 fda_2.4.7
## [7] Matrix_1.2-14 fdrtool_1.2.15
## [9] swfdr_1.6.0 qvalue_2.12.0
## [11] ashr_2.2-7 IHW_1.8.0
## [13] readr_1.1.1 ggthemes_3.5.0
## [15] tibble_1.4.2 cowplot_0.9.2
## [17] magrittr_1.5 data.table_1.11.4
## [19] DESeq2_1.20.0 polyester_1.16.0
## [21] limma_3.36.2 genefilter_1.62.0
## [23] SummarizedBenchmark_0.99.1 dplyr_0.7.5
## [25] mclust_5.4 ggplot2_3.0.0
## [27] stringr_1.3.1 rlang_0.2.1
## [29] UpSetR_1.3.3 SummarizedExperiment_1.10.1
## [31] DelayedArray_0.6.1 BiocParallel_1.14.2
## [33] matrixStats_0.53.1 Biobase_2.40.0
## [35] GenomicRanges_1.32.3 GenomeInfoDb_1.16.0
## [37] IRanges_2.14.10 S4Vectors_0.18.3
## [39] BiocGenerics_0.26.0 tidyr_0.8.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 rprojroot_1.3-2 htmlTable_1.12
## [4] XVector_0.20.0 ggdendro_0.1-20 base64enc_0.1-3
## [7] logspline_2.1.11 rstudioapi_0.7 bit64_0.9-7
## [10] AnnotationDbi_1.42.1 mosaic_1.2.0 codetools_0.2-15
## [13] mnormt_1.5-5 pscl_1.5.2 doParallel_1.0.11
## [16] geneplotter_1.58.0 knitr_1.20 Formula_1.2-3
## [19] broom_0.4.4 annotate_1.58.0 cluster_2.0.7-1
## [22] compiler_3.5.0 backports_1.1.2 assertthat_0.2.0
## [25] lazyeval_0.2.1 acepack_1.4.1 htmltools_0.3.6
## [28] tools_3.5.0 gtable_0.2.0 glue_1.2.0
## [31] GenomeInfoDbData_1.1.0 reshape2_1.4.3 Rcpp_0.12.17
## [34] slam_0.1-43 Biostrings_2.48.0 nlme_3.1-137
## [37] iterators_1.0.9 psych_1.8.4 mosaicCore_0.5.0
## [40] XML_3.98-1.11 zlibbioc_1.26.0 MASS_7.3-50
## [43] scales_0.5.0 hms_0.4.2 RColorBrewer_1.1-2
## [46] yaml_2.1.19 mosaicData_0.16.0 memoise_1.1.0
## [49] gridExtra_2.3 rpart_4.1-13 latticeExtra_0.6-28
## [52] stringi_1.2.2 RSQLite_2.1.1 SQUAREM_2017.10-1
## [55] foreach_1.4.4 checkmate_1.8.5 truncnorm_1.0-8
## [58] pkgconfig_2.0.1 bitops_1.0-6 evaluate_0.10.1
## [61] lattice_0.20-35 lpsymphony_1.8.0 purrr_0.2.5
## [64] bindr_0.1.1 labeling_0.3 htmlwidgets_1.2
## [67] bit_1.1-14 tidyselect_0.2.4 ggformula_0.7.0
## [70] plyr_1.8.4 R6_2.2.2 Hmisc_4.1-1
## [73] DBI_1.0.0 pillar_1.2.3 foreign_0.8-70
## [76] withr_2.1.2 survival_2.41-3 RCurl_1.95-4.10
## [79] nnet_7.3-12 rmarkdown_1.10 locfit_1.5-9.1
## [82] grid_3.5.0 blob_1.1.1 digest_0.6.15
## [85] xtable_1.8-2 munsell_0.4.3