#' Get precalculated statistical test weights
#'
#' This function returns pre-calculated weights for human, chimpanzee, mouse,
#' fruitfly and arabidopsis, based on the performance of the supported statistical
#' tests on simulated datasets estimated from real data of the ReCount database
#' (\url{http://bowtie-bio.sourceforge.net/recount/}). Currently, pre-calculated
#' weights are available only when all six statistical tests are used and only for
#' normalization with EDASeq. For other combinations, use the
#' \code{\link{estimate.aufc.weights}} function.
#'
#' @param org \code{"human"}, \code{"mouse"}, \code{"chimpanzee"}, \code{"fruitfly"}
#' or \code{"arabidopsis"}.
#' @return A named vector of convex weights.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' wh <- get.weights("human")
#'}
get.weights <- function(org=c("human","chimpanzee","mouse","fruitfly",
"arabidopsis")) {
org <- tolower(org)
check.text.args("org",org,c("human","chimpanzee","mouse","fruitfly",
"arabidopsis"))
switch(org,
human = {
return(c(
deseq=0.05772458,
edger=0.14321672,
limma=0.34516089,
nbpseq=0.06108182,
noiseq=0.11595169,
bayseq=0.27686431
))
},
chimpanzee = {
return(c(
deseq=0.06026782,
edger=0.14964358,
limma=0.33500306,
nbpseq=0.05814585,
noiseq=0.11337043,
bayseq=0.28356925
))
},
mouse = {
return(c(
deseq=0.05257695,
edger=0.24161354,
limma=0.29957277,
nbpseq=0.04914485,
noiseq=0.06847809,
bayseq=0.28861381
))
},
fruitfly = {
return(c(
deseq=0.01430269,
edger=0.12923339,
limma=0.38315685,
nbpseq=0.01265952,
noiseq=0.06778537,
bayseq=0.39286218
))
},
arabidopsis = {
return(c(
deseq=0.04926122,
edger=0.10130858,
limma=0.40842011,
nbpseq=0.04596652,
noiseq=0.09336509,
bayseq=0.30167848
))
        }
)
}
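# Illustrative sketch (kept inert in an if (FALSE) block so that nothing is
# evaluated when the package is loaded): the vector returned by get.weights()
# is convex, i.e. non-negative and summing to 1. The weighted combination of
# hypothetical per-test p-values below only illustrates what "convex weights"
# implies; it is not necessarily the exact meta-analysis formula used elsewhere
# in the package.
if (FALSE) {
    w <- get.weights("human")
    stopifnot(all(w >= 0), abs(sum(w) - 1) < 1e-6)
    # Hypothetical p-values of a single gene, one per statistical test
    p <- c(deseq=0.01,edger=0.02,limma=0.005,nbpseq=0.03,noiseq=0.1,
        bayseq=0.008)
    sum(w*p[names(w)]) # a simple convex combination of the six p-values
}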
#' Default parameters for several metaseqr functions
#'
#' This function returns a list with the default settings for each filtering,
#' statistical and normalization algorithm included in the metaseqR package.
#' See the documentation of the main function and the documentation of each
#' statistical and normalization method for details.
#'
#' @param what a keyword determining the procedure for which to fetch the default
#' settings according to method parameter. It can be one of \code{"normalization"},
#' \code{"statistics"}, \code{"gene.filter"}, \code{"exon.filter"} or
#' \code{"biotype.filter"}.
#' @param method the supported algorithm included in metaseqR for which to fetch
#' the default settings. When \code{what} is \code{"normalization"}, method is one
#' of \code{"edaseq"}, \code{"deseq"}, \code{"edger"}, \code{"noiseq"} or
#' \code{"nbpseq"}. When \code{what} is \code{"statistics"}, method is one of
#' \code{"deseq"}, \code{"edger"}, \code{"noiseq"}, \code{"bayseq"}, \code{"limma"}
#' or \code{"nbpseq"}. When \code{method} is \code{"biotype.filter"}, \code{what}
#' is the input organism (see the main \code{\link{metaseqr}} help page for a list
#' of supported organisms).
#' @return A list with default settings that can be used directly in the call of
#' metaseqr.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' norm.args.edaseq <- get.defaults("normalization","edaseq")
#' stat.args.edger <- get.defaults("statistics","edger")
#'}
get.defaults <- function(what,method=NULL) {
if (what %in% c("normalization","statistics") && is.null(method))
stopwrap("The method argument must be provided when what is ",
"\"normalization\" or \"statistics\"!")
switch(what,
normalization = {
switch(method,
edaseq = {
return(list(within.which="loess",between.which="full"))
},
deseq = {
return(list(locfunc=median))
},
edger = {
return(list(
method="TMM",refColumn=NULL,logratioTrim=0.3,
sumTrim=0.05,doWeighting=TRUE,Acutoff=-1e10,p=0.75
))
},
noiseq = {
return(list(
method="tmm", # which normalization
long=1000,lc=1,k=1, # common arguments
refColumn=1,logratioTrim=0.3,sumTrim=0.05,
doWeighting=TRUE,Acutoff=-1e+10 # TMM normalization arguments
))
},
nbpseq = {
return(list(main.method="nbsmyth",method="AH2010",
thinning=TRUE))
}
)
},
statistics = {
switch(method,
deseq = {
return(list(method="blind",sharingMode="fit-only",
fitType="local"))
},
edger = {
return(list(
main.method="classic", # classic or glm fit
rowsum.filter=5,prior.df=10,
trend="movingave",span=NULL, # classic estimateCommonDisp arguments
tag.method="grid",grid.length=11,grid.range=c(-6,6), # classic estimateTagwiseDisp arguments
offset=NULL,glm.method="CoxReid",subset=10000, # glm estimateGLMCommonDisp and estimateGLMTrendedDisp arguments
AveLogCPM=NULL,trend.method="auto", # glm estimateGLMTagwiseDisp arguments
dispersion=NULL,offset=NULL,weights=NULL, # glmFit arguments
lib.size=NULL,prior.count=0.125,start=NULL,
method="auto",test="chisq", # glmLRT arguments
abundance.trend=TRUE,robust=FALSE,
                    winsor.tail.p=c(0.05,0.1) # glmQLFTest arguments
))
},
noiseq = {
return(list(
k=0.5,norm="n",replicates="biological",
factor="class",conditions=NULL,pnr=0.2,
nss=5,v=0.02,lc=1, # noiseq general and specific arguments
nclust=15,r=100,adj=1.5,
a0per=0.9,filter=0,depth=NULL,
cv.cutoff=500,cpm=1 # noiseqbio specific arguments
))
},
bayseq = {
return(list(samplesize=10000,samplingSubset=NULL,
equalDispersions=TRUE,estimation="QL",zeroML=FALSE,
consensus=FALSE,moderate=TRUE,pET="BIC",
marginalise=FALSE,subset=NULL,priorSubset=NULL,
bootStraps=1,conv=1e-4,nullData=FALSE,returnAll=FALSE,
returnPD=FALSE,discardSampling=FALSE,cl=NULL))
},
limma = {
return(list(normalize.method="none"))
},
nbpseq = {
return(list(
main.method="nbsmyth",
model=list(nbpseq="log-linear-rel-mean",nbsmyth="NBP"),
tests="HOA",
alternative="two.sided"
))
}
)
},
gene.filter = {
return(list(
length=list(
length=500
),
avg.reads=list(
average.per.bp=100,
quantile=0.75
),
expression=list(
median=TRUE,
mean=FALSE,
quantile=NA,
known=NA,
custom=NA
),
biotype=get.defaults("biotype.filter",method[1]),
presence=list(
frac=0.25,
min.count=10,
per.condition=FALSE
)
))
},
exon.filter = {
return(list(
mnrpx=list(
exons.per.gene=5,
min.exons=2,
frac=1/5
)
))
},
biotype.filter = {
switch(method,
hg18 = {
return(list(
unprocessed_pseudogene=TRUE,
pseudogene=FALSE,
miRNA=FALSE,
retrotransposed=FALSE,
protein_coding=FALSE,
processed_pseudogene=FALSE,
snRNA=FALSE,
snRNA_pseudogene=TRUE,
Mt_tRNA_pseudogene=TRUE,
miRNA_pseudogene=TRUE,
misc_RNA=FALSE,
tRNA_pseudogene=TRUE,
snoRNA=FALSE,
scRNA_pseudogene=TRUE,
rRNA_pseudogene=TRUE,
snoRNA_pseudogene=TRUE,
rRNA=TRUE,
misc_RNA_pseudogene=TRUE,
IG_V_gene=FALSE,
IG_D_gene=FALSE,
IG_J_gene=FALSE,
IG_C_gene=FALSE,
IG_pseudogene=TRUE,
scRNA=FALSE
))
},
hg19 = {
return(list(
pseudogene=FALSE,
lincRNA=FALSE,
protein_coding=FALSE,
antisense=FALSE,
processed_transcript=FALSE,
snRNA=FALSE,
sense_intronic=FALSE,
miRNA=FALSE,
misc_RNA=FALSE,
snoRNA=FALSE,
rRNA=TRUE,
polymorphic_pseudogene=FALSE,
sense_overlapping=FALSE,
three_prime_overlapping_ncrna=FALSE,
TR_V_gene=FALSE,
TR_V_pseudogene=TRUE,
TR_D_gene=FALSE,
TR_J_gene=FALSE,
TR_C_gene=FALSE,
TR_J_pseudogene=TRUE,
IG_C_gene=FALSE,
IG_C_pseudogene=TRUE,
IG_J_gene=FALSE,
IG_J_pseudogene=TRUE,
IG_D_gene=FALSE,
IG_V_gene=FALSE,
IG_V_pseudogene=TRUE
))
},
hg38 = {
return(list(
protein_coding=FALSE,
polymorphic_pseudogene=FALSE,
lincRNA=FALSE,
unprocessed_pseudogene=TRUE,
processed_pseudogene=FALSE,
antisense=FALSE,
processed_transcript=FALSE,
transcribed_unprocessed_pseudogene=FALSE,
sense_intronic=FALSE,
unitary_pseudogene=TRUE,
IG_V_gene=FALSE,
IG_V_pseudogene=TRUE,
TR_V_gene=FALSE,
sense_overlapping=FALSE,
transcribed_processed_pseudogene=FALSE,
miRNA=FALSE,
snRNA=FALSE,
misc_RNA=FALSE,
rRNA=TRUE,
snoRNA=FALSE,
IG_J_pseudogene=TRUE,
IG_J_gene=FALSE,
IG_D_gene=FALSE,
three_prime_overlapping_ncrna=FALSE,
IG_C_gene=FALSE,
IG_C_pseudogene=TRUE,
pseudogene=TRUE,
TR_V_pseudogene=TRUE,
Mt_tRNA=TRUE,
Mt_rRNA=TRUE,
translated_processed_pseudogene=FALSE,
TR_J_gene=FALSE,
TR_C_gene=FALSE,
TR_D_gene=FALSE,
TR_J_pseudogene=TRUE,
LRG_gene=FALSE
))
},
mm9 = {
return(list(
pseudogene=FALSE,
snRNA=FALSE,
protein_coding=FALSE,
antisense=FALSE,
miRNA=FALSE,
lincRNA=FALSE,
snoRNA=FALSE,
processed_transcript=FALSE,
misc_RNA=FALSE,
rRNA=TRUE,
sense_overlapping=FALSE,
sense_intronic=FALSE,
polymorphic_pseudogene=FALSE,
non_coding=FALSE,
three_prime_overlapping_ncrna=FALSE,
IG_C_gene=FALSE,
IG_J_gene=FALSE,
IG_D_gene=FALSE,
IG_V_gene=FALSE,
ncrna_host=FALSE
))
},
mm10 = {
return(list(
pseudogene=FALSE,
snRNA=FALSE,
protein_coding=FALSE,
antisense=FALSE,
miRNA=FALSE,
snoRNA=FALSE,
lincRNA=FALSE,
processed_transcript=FALSE,
misc_RNA=FALSE,
rRNA=TRUE,
sense_intronic=FALSE,
sense_overlapping=FALSE,
polymorphic_pseudogene=FALSE,
IG_C_gene=FALSE,
IG_J_gene=FALSE,
IG_D_gene=FALSE,
IG_LV_gene=FALSE,
IG_V_gene=FALSE,
IG_V_pseudogene=TRUE,
TR_V_gene=FALSE,
TR_V_pseudogene=TRUE,
three_prime_overlapping_ncrna=FALSE
))
},
dm3 = {
return(list(
protein_coding=FALSE,
ncRNA=FALSE,
snoRNA=FALSE,
pre_miRNA=FALSE,
pseudogene=FALSE,
snRNA=FALSE,
tRNA=FALSE,
rRNA=TRUE
))
},
rn5 = {
return(list(
protein_coding=FALSE,
pseudogene=FALSE,
processed_pseudogene=FALSE,
miRNA=FALSE,
rRNA=TRUE,
misc_RNA=FALSE
))
},
rn6 = {
return(list(
antisense=FALSE,
lincRNA=FALSE,
miRNA=FALSE,
misc_RNA=FALSE,
processed_pseudogene=FALSE,
processed_transcript=FALSE,
protein_coding=FALSE,
pseudogene=FALSE,
ribozyme=FALSE,
rRNA=TRUE,
scaRNA=FALSE,
sense_intronic=FALSE,
snoRNA=FALSE,
snRNA=FALSE,
sRNA=FALSE,
TEC=FALSE,
transcribed_processed_pseudogene=FALSE,
transcribed_unprocessed_pseudogene=FALSE,
unprocessed_pseudogene=FALSE
))
},
danrer7 = {
return(list(
antisense=FALSE,
protein_coding=FALSE,
miRNA=FALSE,
snoRNA=FALSE,
rRNA=TRUE,
lincRNA=FALSE,
processed_transcript=FALSE,
snRNA=FALSE,
pseudogene=FALSE,
sense_intronic=FALSE,
misc_RNA=FALSE,
polymorphic_pseudogene=FALSE,
IG_V_pseudogene=TRUE,
IG_C_pseudogene=TRUE,
IG_J_pseudogene=TRUE,
non_coding=FALSE,
sense_overlapping=FALSE
))
},
pantro4 = {
return(list(
protein_coding=FALSE,
pseudogene=FALSE,
processed_pseudogene=FALSE,
miRNA=FALSE,
rRNA=TRUE,
snRNA=FALSE,
snoRNA=FALSE,
misc_RNA=FALSE
))
},
susscr3 = {
return(list(
antisense=FALSE,
protein_coding=FALSE,
lincRNA=FALSE,
pseudogene=FALSE,
processed_transcript=FALSE,
miRNA=FALSE,
rRNA=TRUE,
snRNA=FALSE,
snoRNA=FALSE,
misc_RNA=FALSE,
non_coding=FALSE,
IG_C_gene=FALSE,
IG_J_gene=FALSE,
IG_V_gene=FALSE,
IG_V_pseudogene=TRUE
))
},
tair10 = {
return(list(
miRNA=FALSE,
ncRNA=FALSE,
protein_coding=FALSE,
pseudogene=FALSE,
rRNA=TRUE,
snoRNA=FALSE,
snRNA=FALSE,
transposable_element=FALSE,
tRNA=FALSE
))
},
equcab2 = {
return(list(
miRNA=FALSE,
misc_RNA=FALSE,
protein_coding=FALSE,
pseudogene=FALSE,
processed_pseudogene=FALSE,
rRNA=TRUE,
snoRNA=FALSE,
snRNA=FALSE
))
}
)
}
)
}
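# Illustrative sketch (inert): fetch a default argument list and override a
# single member before passing it downstream, e.g. through the norm.args and
# stat.args arguments of the main metaseqr call. The overriding values
# ("median", "parametric") are assumptions shown only as examples of settings
# accepted by the respective external tools. modifyList() comes from utils.
if (FALSE) {
    norm.args <- get.defaults("normalization","edaseq")
    norm.args <- modifyList(norm.args,list(within.which="median"))
    stat.args <- get.defaults("statistics","deseq")
    stat.args$fitType <- "parametric"
}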
#' Validate normalization and statistical algorithm arguments
#'
#' This function checks and validates the arguments passed by the user to the
#' normalization and statistics algorithms supported by metaseqR. As these are
#' given into lists and passed to the algorithms, the list members must be checked
#' for \code{NULL}, valid names etc. This function performs these checks and
#' ignores any invalid arguments.
#'
#' @param normalization a keyword determining the normalization strategy to be
#' performed by metaseqR. See \code{\link{metaseqr}} main help page for details.
#' @param statistics the statistical tests to be performed by metaseqR. See
#' \code{\link{metaseqr}} main help page for details.
#' @param norm.args the user input list of normalization arguments. See
#' \code{\link{metaseqr}} main help page for details.
#' @param stat.args the user input list of statistical test arguments. See
#' \code{\link{metaseqr}} main help page for details.
#' @return A list with two members (\code{norm.args}, \code{stat.args}) with valid
#' arguments to be used as user input for the algorithms supported by metaseqR.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' normalization <- "edaseq"
#' statistics <- "edger"
#' norm.args <- get.defaults("normalization","edaseq")
#' stat.args <- get.defaults("statistics","deseq")
#' # Will return as is
#' val <- validate.alg.args(normalization,statistics,norm.args,stat.args)
#' val$norm.args
#' val$stat.args
#' # but...
#' stat.args <- c(stat.args,my.irrelevant.arg=999)
#' val <- validate.alg.args(normalization,statistics,norm.args,stat.args)
#' # irrelevant argument will be removed
#' val$norm.args
#' val$stat.args
#'}
validate.alg.args <- function(normalization,statistics,norm.args,stat.args) {
if (normalization=="each") {
if (!is.null(norm.args)) {
for (s in statistics) {
if (!is.null(norm.args[[s]])) {
switch(s,
deseq = {
tmp <- norm.args[[s]]
tmp <- validate.list.args("normalization",s,tmp)
norm.args[[s]] <- get.defaults("normalization",s)
if (length(tmp)>0)
norm.args[[s]] <- set.arg(norm.args[[s]],tmp)
},
edger = {
tmp <- norm.args[[s]]
tmp <- validate.list.args("statistics",s,tmp)
norm.args[[s]] <- get.defaults("normalization",s)
if (length(tmp)>0)
norm.args[[s]] <- set.arg(norm.args[[s]],tmp)
},
limma = {
tmp <- norm.args[[s]]
tmp <- validate.list.args("statistics","edger",tmp)
norm.args[[s]] <- get.defaults("normalization",
"edger")
if (length(tmp)>0)
norm.args[[s]] <- set.arg(norm.args[[s]],tmp)
},
nbpseq = {
tmp <- norm.args[[s]]
tmp <- validate.list.args("statistics",s,tmp)
norm.args[[s]] <- get.defaults("normalization",s)
if (length(tmp)>0)
norm.args[[s]] <- set.arg(norm.args[[s]],tmp)
},
noiseq = {
tmp <- norm.args[[s]]
tmp <- validate.list.args("statistics",s,tmp)
norm.args[[s]] <- get.defaults("normalization",s)
if (length(tmp)>0)
norm.args[[s]] <- set.arg(norm.args[[s]],tmp)
},
bayseq = {
tmp <- norm.args[[s]]
tmp <- validate.list.args("statistics","edger",tmp)
norm.args[[s]] <- get.defaults("normalization",
"edger")
if (length(tmp)>0)
norm.args[[s]] <- set.arg(norm.args[[s]],tmp)
}
)
}
else {
switch(s,
deseq = {
                            norm.args[[s]] <- get.defaults("normalization",
"deseq")
},
edger = {
                            norm.args[[s]] <- get.defaults("normalization",
"edger")
},
limma = {
                            norm.args[[s]] <- get.defaults("normalization",
"edger")
},
nbpseq = {
                            norm.args[[s]] <- get.defaults("normalization",
"nbpseq")
},
noiseq = {
                            norm.args[[s]] <- get.defaults("normalization",
"noiseq")
},
bayseq = {
                            norm.args[[s]] <- get.defaults("normalization",
"edger")
}
)
}
}
}
else {
norm.args <- vector("list",length(statistics))
names(norm.args) <- statistics
for (s in statistics) {
switch(s,
deseq = {
norm.args[[s]] <- get.defaults(normalization,"deseq")
},
edger = {
norm.args[[s]] <- get.defaults(normalization,"edger")
},
limma = {
norm.args[[s]] <- get.defaults(normalization,"edger")
},
nbpseq = {
norm.args[[s]] <- get.defaults(normalization,"nbpseq")
},
noiseq = {
norm.args[[s]] <- get.defaults(normalization,"noiseq")
},
bayseq = {
norm.args[[s]] <- get.defaults(normalization,"edger")
}
)
}
}
}
else {
if (!is.null(norm.args))
{
tmp <- norm.args
tmp <- validate.list.args("normalization",normalization,tmp)
norm.args <- get.defaults("normalization",normalization)
if (length(tmp)>0)
norm.args <- set.arg(norm.args,tmp)
}
else
norm.args <- get.defaults("normalization",normalization)
}
for (s in statistics) {
if (!is.null(stat.args[[s]])) {
tmp <- stat.args[[s]]
tmp <- validate.list.args("statistics",s,tmp)
stat.args[[s]] <- get.defaults("statistics",s)
if (length(tmp)>0)
stat.args[[s]] <- set.arg(stat.args[[s]],tmp)
}
else
stat.args[[s]] <- get.defaults("statistics",s)
}
return(list(norm.args=norm.args,stat.args=stat.args))
}
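# Illustrative sketch (inert): when normalization="each", each statistical test
# gets its own normalization argument list, so both returned members are named
# after the requested statistics.
if (FALSE) {
    val <- validate.alg.args("each",c("deseq","edger"),NULL,NULL)
    names(val$norm.args) # "deseq" "edger"
    names(val$stat.args) # "deseq" "edger"
    val$norm.args$deseq  # the DESeq normalization defaults
}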
#' Validate list parameters for several metaseqR functions
#'
#' This function validates the arguments passed by the user to the normalization,
#' statistics and filtering algorithms supported by metaseqR. As these are given
#' into lists and passed to the algorithms, the list member names must be valid
#' algorithm arguments for the pipeline not to crash. This function performs these
#' checks and ignores any invalid arguments.
#'
#' @param what a keyword determining the procedure for which to validate arguments.
#' It can be one of \code{"normalization"}, \code{"statistics"}, \code{"gene.filter"},
#' \code{"exon.filter"} or \code{"biotype.filter"}.
#' @param method the normalization/statistics/filtering algorithm included in
#' metaseqR for which to validate user input. When \code{what} is
#' \code{"normalization"}, method is one of \code{"edaseq"}, \code{"deseq"},
#' \code{"edger"}, \code{"noiseq"} or \code{"nbpseq"}. When \code{what} is
#' \code{"statistics"}, method is one of \code{"deseq"}, \code{"edger"},
#' \code{"noiseq"}, \code{"bayseq"}, \code{"limma"} or \code{"nbpseq"}. When
#' \code{method} is \code{"biotype.filter"}, \code{what} is the input organism
#' (see the main \code{\link{metaseqr}} help page for a list of supported organisms).
#' @param arg.list the user input list of arguments.
#' @return A list with valid arguments to be used as user input in the algorithms
#' supported by metaseqR.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' norm.args.edger <- list(method="TMM",refColumn=NULL,
#' logratioTrim=0.3,sumTrim=0.05,doWeighting=TRUE,
#' Bcutoff=-1e10,p=0.75)
#' # Bcutoff does not exist, will throw a warning and ignore it.
#' norm.args.edger <- validate.list.args("normalization","edger",norm.args.edger)
#'}
validate.list.args <- function(what,method=NULL,arg.list) {
what <- tolower(what)
check.text.args("what",what,c("normalization","statistics","gene.filter",
"exon.filter","biotype.filter"))
if (what %in% c("normalization","statistics") && is.null(method))
stopwrap("The method argument must be provided when what is ",
"\"normalization\" or \"statistics\"!")
switch(what,
normalization = {
switch(method,
edaseq = {
valid <- names(arg.list) %in% c("within.which",
"between.which")
not.valid <- which(!valid)
},
deseq = {
valid <- names(arg.list) %in% c("locfunc")
not.valid <- which(!valid)
},
edger = {
valid <- names(arg.list) %in% c("method","refColumn",
"logratioTrim","sumTrim","doWeighting","Acutoff","p")
not.valid <- which(!valid)
},
noiseq = {
valid <- names(arg.list) %in% c("method","long","lc","k",
"refColumn","logratioTrim","sumTrim","doWeighting",
"Acutoff")
not.valid <- which(!valid)
},
nbpseq = {
valid <- names(arg.list) %in% c("main.method","method",
"thinning")
not.valid <- which(!valid)
}
)
if (length(not.valid)>0) {
warnwrap(paste("The following",method,what,"argument names",
"are invalid and will be ignored:",
paste(names(arg.list)[not.valid],collapse=", ")))
arg.list[not.valid] <- NULL
}
return(arg.list)
},
statistics = {
switch(method,
deseq = {
valid <- names(arg.list) %in% c("method","sharingMode",
"fitType")
not.valid <- which(!valid)
},
edger = {
valid <- names(arg.list) %in% c("main.method",
"rowsum.filter","prior.df","trend","span","tag.method",
"grid.length","grid.range","offset","glm.method",
"subset","AveLogCPM","trend.method","dispersion",
"offset","weights","lib.size","prior.count","start",
"method","abundance.trend","robust",
"winsor.tail.p")
not.valid <- which(!valid)
},
noiseq = {
valid <- names(arg.list) %in% c("k","norm","replicates",
"factor","conditions","pnr","nss","v","lc","nclust","r",
"adj","a0per","filter","depth","cv.cutoff","cpm")
not.valid <- which(!valid)
},
bayseq = {
valid <- names(arg.list) %in% c("samplesize",
"samplingSubset","equalDispersions","estimation",
"zeroML","consensus","moderate","pET","marginalise",
"subset","priorSubset","bootStraps","conv","nullData",
"returnAll","returnPD","discardSampling","cl")
not.valid <- which(!valid)
},
limma = {
valid <- names(arg.list) %in% c("normalize.method")
not.valid <- which(!valid)
},
nbpseq = {
valid <- names(arg.list) %in% c("main.method","method",
"tests","alternative")
not.valid <- which(!valid)
}
)
if (length(not.valid)>0) {
warnwrap(paste("The following",method,what,"argument names",
"are invalid and will be ignored:",
paste(names(arg.list)[not.valid],collapse=", ")))
arg.list[not.valid] <- NULL
}
return(arg.list)
},
gene.filter = {
valid.1 <- names(arg.list) %in% c("length","avg.reads","expression",
"biotype","presence")
not.valid.1 <- which(!valid.1)
if (length(not.valid.1)>0) {
warnwrap(paste("The following",method,what,"argument names",
"are invalid and will be ignored:",
paste(names(arg.list)[not.valid.1],collapse=", ")))
arg.list[not.valid.1] <- NULL
}
if (length(arg.list)>0) {
for (n in names(arg.list)) {
switch(n,
length = {
valid.2 <- names(arg.list[[n]]) %in% c("length")
not.valid.2 <- which(!valid.2)
},
avg.reads = {
valid.2 <- names(arg.list[[n]]) %in%
c("average.per.bp","quantile")
not.valid.2 <- which(!valid.2)
},
expression = {
valid.2 <- names(arg.list[[n]]) %in% c("median",
"mean","quantile","known","custom")
not.valid.2 <- which(!valid.2)
},
presence = {
valid.2 <- names(arg.list[[n]]) %in% c("frac",
"min.count","per.condition")
not.valid.2 <- which(!valid.2)
}
)
if (length(not.valid.2)>0) {
warnwrap(paste("The following",method,what,
"sub-argument names are invalid and will be",
"ignored:",paste(names(arg.list[[n]])[not.valid.2],
collapse=", ")))
arg.list[[n]][not.valid.2] <- NULL
}
}
}
return(arg.list)
},
exon.filter = {
valid.1 <- names(arg.list) %in% c("mnrpx")
not.valid.1 <- which(!valid.1)
if (length(not.valid.1)>0) {
warnwrap(paste("The following",method,what,"argument names",
"are invalid and will be ignored:",
paste(names(arg.list)[not.valid.1],collapse=", ")))
arg.list[not.valid.1] <- NULL
}
if (length(arg.list)>0) {
for (n in names(arg.list)) {
switch(n,
mnrpx = {
valid.2 <- names(arg.list[[n]]) %in%
c("exons.per.gene","min.exons","frac")
not.valid.2 <- which(!valid.2)
}
)
if (length(not.valid.2)>0) {
warnwrap(paste("The following",method,what,
"sub-argument names are invalid and will be",
"ignored:",paste(names(arg.list[[n]])[not.valid.2],
collapse=", ")))
arg.list[[n]][not.valid.2] <- NULL
}
}
}
return(arg.list)
},
biotype.filter = {
switch(method,
hg18 = {
valid <- names(arg.list) %in% c("unprocessed_pseudogene",
"pseudogene","miRNA","retrotransposed","protein_coding",
"processed_pseudogene","snRNA","snRNA_pseudogene",
"Mt_tRNA_pseudogene","miRNA_pseudogene","misc_RNA",
"tRNA_pseudogene","snoRNA","scRNA_pseudogene",
"rRNA_pseudogene","snoRNA_pseudogene","rRNA",
"misc_RNA_pseudogene","IG_V_gene","IG_D_gene",
"IG_J_gene","IG_C_gene","IG_pseudogene","scRNA")
not.valid <- which(!valid)
},
hg19 = {
valid <- names(arg.list) %in% c("pseudogene","lincRNA",
"protein_coding","antisense","processed_transcript",
"snRNA","sense_intronic","miRNA","misc_RNA","snoRNA",
"rRNA","polymorphic_pseudogene","sense_overlapping",
"three_prime_overlapping_ncrna","TR_V_gene",
"TR_V_pseudogene","TR_D_gene","TR_J_gene","TR_C_gene",
"TR_J_pseudogene","IG_C_gene","IG_C_pseudogene",
"IG_J_gene","IG_J_pseudogene","IG_D_gene","IG_V_gene",
"IG_V_pseudogene")
not.valid <- which(!valid)
},
mm9 = {
valid <- names(arg.list) %in% c("pseudogene","snRNA",
"protein_coding","antisense","miRNA","lincRNA","snoRNA",
"processed_transcript","misc_RNA","rRNA",
"sense_overlapping","sense_intronic",
"polymorphic_pseudogene","non_coding",
"three_prime_overlapping_ncrna","IG_C_gene","IG_J_gene",
"IG_D_gene","IG_V_gene","ncrna_host")
not.valid <- which(!valid)
},
mm10 = {
valid <- names(arg.list) %in% c("pseudogene","snRNA",
"protein_coding","antisense","miRNA","snoRNA","lincRNA",
"processed_transcript","misc_RNA","rRNA",
"sense_intronic","sense_overlapping",
"polymorphic_pseudogene","IG_C_gene","IG_J_gene",
"IG_D_gene","IG_LV_gene","IG_V_gene","IG_V_pseudogene",
"TR_V_gene","TR_V_pseudogene",
"three_prime_overlapping_ncrna")
not.valid <- which(!valid)
},
dm3 = {
valid <- names(arg.list) %in% c("protein_coding","ncRNA",
"snoRNA","pre_miRNA","pseudogene","snRNA","tRNA","rRNA")
not.valid <- which(!valid)
},
rn5 = {
valid <- names(arg.list) %in% c("protein_coding",
"pseudogene","processed_pseudogene","miRNA","rRNA",
"misc_RNA")
not.valid <- which(!valid)
},
danrer7 = {
valid <- names(arg.list) %in% c("antisense",
"protein_coding","miRNA","snoRNA","rRNA","lincRNA",
"processed_transcript","snRNA","pseudogene",
"sense_intronic","misc_RNA","polymorphic_pseudogene",
"IG_V_pseudogene","IG_C_pseudogene","IG_J_pseudogene",
"non_coding","sense_overlapping")
not.valid <- which(!valid)
},
pantro4 = {
valid <- names(arg.list) %in% c("protein_coding",
"pseudogene","processed_pseudogene","miRNA","rRNA",
"snRNA","snoRNA","misc_RNA")
not.valid <- which(!valid)
},
susscr3 = {
valid <- names(arg.list) %in% c("antisense",
"protein_coding","lincRNA","pseudogene",
"processed_transcript","miRNA","rRNA","snRNA","snoRNA",
"misc_RNA","non_coding","IG_C_gene","IG_J_gene",
"IG_V_gene","IG_V_pseudogene")
not.valid <- which(!valid)
},
tair10 = {
valid <- names(arg.list) %in% c("miRNA","ncRNA",
"protein_coding","pseudogene","rRNA","snoRNA",
"snRNA","transposable_element","tRNA")
not.valid <- which(!valid)
},
equcab2 = {
valid <- names(arg.list) %in% c("miRNA","misc_RNA",
"protein_coding","pseudogene","processed_pseudogene",
"rRNA","snoRNA","snRNA")
}
)
if (length(not.valid)>0) {
warnwrap(paste("The following",method,what,"argument names",
"are invalid and will be ignored:",
paste(names(arg.list)[not.valid],collapse=", ")))
arg.list[not.valid] <- NULL
}
return(arg.list)
}
)
}
#' Assemble a stricter biotype filter
#'
#' Returns a list of TRUE/FALSE values denoting which biotypes are to be filtered
#' more strictly than with the package defaults. This is a helper function for
#' the analysis presets of metaseqR. Internal use only.
#'
#' @param org one of the supported organisms.
#' @return A list of booleans, one for each biotype.
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' sf <- get.strict.biofilter("hg18")
#'}
get.strict.biofilter <- function(org) {
switch(org,
hg18 = {
return(list(
unprocessed_pseudogene=TRUE,
pseudogene=TRUE,
miRNA=FALSE,
retrotransposed=FALSE,
protein_coding=FALSE,
processed_pseudogene=TRUE,
snRNA=FALSE,
snRNA_pseudogene=TRUE,
Mt_tRNA_pseudogene=TRUE,
miRNA_pseudogene=TRUE,
misc_RNA=TRUE,
tRNA_pseudogene=TRUE,
snoRNA=TRUE,
scRNA_pseudogene=TRUE,
rRNA_pseudogene=TRUE,
snoRNA_pseudogene=TRUE,
rRNA=TRUE,
misc_RNA_pseudogene=TRUE,
IG_V_gene=FALSE,
IG_D_gene=FALSE,
IG_J_gene=FALSE,
IG_C_gene=FALSE,
IG_pseudogene=TRUE,
scRNA=FALSE
))
},
hg19 = {
return(list(
pseudogene=TRUE,
lincRNA=FALSE,
protein_coding=FALSE,
antisense=FALSE,
processed_transcript=FALSE,
snRNA=FALSE,
sense_intronic=FALSE,
miRNA=FALSE,
misc_RNA=FALSE,
snoRNA=TRUE,
rRNA=TRUE,
polymorphic_pseudogene=TRUE,
sense_overlapping=FALSE,
three_prime_overlapping_ncrna=FALSE,
TR_V_gene=FALSE,
TR_V_pseudogene=TRUE,
TR_D_gene=FALSE,
TR_J_gene=FALSE,
TR_C_gene=FALSE,
TR_J_pseudogene=TRUE,
IG_C_gene=FALSE,
IG_C_pseudogene=TRUE,
IG_J_gene=FALSE,
IG_J_pseudogene=TRUE,
IG_D_gene=FALSE,
IG_V_gene=FALSE,
IG_V_pseudogene=TRUE
))
},
hg38 = {
return(list(
protein_coding=FALSE,
polymorphic_pseudogene=TRUE,
lincRNA=FALSE,
unprocessed_pseudogene=TRUE,
processed_pseudogene=TRUE,
antisense=FALSE,
processed_transcript=FALSE,
transcribed_unprocessed_pseudogene=TRUE,
sense_intronic=FALSE,
unitary_pseudogene=TRUE,
IG_V_gene=FALSE,
IG_V_pseudogene=TRUE,
TR_V_gene=FALSE,
sense_overlapping=FALSE,
transcribed_processed_pseudogene=TRUE,
miRNA=FALSE,
snRNA=FALSE,
misc_RNA=FALSE,
rRNA=TRUE,
snoRNA=TRUE,
IG_J_pseudogene=TRUE,
IG_J_gene=FALSE,
IG_D_gene=FALSE,
three_prime_overlapping_ncrna=FALSE,
IG_C_gene=FALSE,
IG_C_pseudogene=TRUE,
pseudogene=TRUE,
TR_V_pseudogene=TRUE,
Mt_tRNA=TRUE,
Mt_rRNA=TRUE,
translated_processed_pseudogene=TRUE,
TR_J_gene=FALSE,
TR_C_gene=FALSE,
TR_D_gene=FALSE,
TR_J_pseudogene=TRUE,
LRG_gene=FALSE
))
},
mm9 = {
return(list(
pseudogene=TRUE,
snRNA=FALSE,
protein_coding=FALSE,
antisense=FALSE,
miRNA=FALSE,
lincRNA=FALSE,
snoRNA=TRUE,
processed_transcript=FALSE,
misc_RNA=TRUE,
rRNA=TRUE,
sense_overlapping=FALSE,
sense_intronic=FALSE,
polymorphic_pseudogene=TRUE,
non_coding=FALSE,
three_prime_overlapping_ncrna=FALSE,
IG_C_gene=FALSE,
IG_J_gene=FALSE,
IG_D_gene=FALSE,
IG_V_gene=FALSE,
ncrna_host=FALSE
))
},
mm10 = {
return(list(
pseudogene=TRUE,
snRNA=FALSE,
protein_coding=FALSE,
antisense=FALSE,
miRNA=FALSE,
snoRNA=TRUE,
lincRNA=FALSE,
processed_transcript=FALSE,
misc_RNA=TRUE,
rRNA=TRUE,
sense_intronic=FALSE,
sense_overlapping=FALSE,
polymorphic_pseudogene=TRUE,
IG_C_gene=FALSE,
IG_J_gene=FALSE,
IG_D_gene=FALSE,
IG_LV_gene=FALSE,
IG_V_gene=FALSE,
IG_V_pseudogene=TRUE,
TR_V_gene=FALSE,
TR_V_pseudogene=TRUE,
three_prime_overlapping_ncrna=FALSE
))
},
dm3 = {
return(list(
protein_coding=FALSE,
ncRNA=FALSE,
snoRNA=TRUE,
pre_miRNA=FALSE,
pseudogene=TRUE,
snRNA=FALSE,
tRNA=FALSE,
rRNA=TRUE
))
},
rn5 = {
return(list(
protein_coding=FALSE,
pseudogene=TRUE,
processed_pseudogene=FALSE,
miRNA=FALSE,
rRNA=TRUE,
misc_RNA=TRUE
))
},
danrer7 = {
return(list(
antisense=FALSE,
protein_coding=FALSE,
miRNA=FALSE,
snoRNA=TRUE,
rRNA=TRUE,
lincRNA=FALSE,
processed_transcript=FALSE,
snRNA=FALSE,
pseudogene=TRUE,
sense_intronic=FALSE,
misc_RNA=TRUE,
polymorphic_pseudogene=TRUE,
IG_V_pseudogene=TRUE,
IG_C_pseudogene=TRUE,
IG_J_pseudogene=TRUE,
non_coding=FALSE,
sense_overlapping=FALSE
))
},
pantro4 = {
return(list(
protein_coding=FALSE,
pseudogene=TRUE,
processed_pseudogene=TRUE,
miRNA=FALSE,
rRNA=TRUE,
snRNA=TRUE,
snoRNA=TRUE,
misc_RNA=TRUE
))
},
susscr3 = {
return(list(
antisense=FALSE,
protein_coding=FALSE,
lincRNA=FALSE,
pseudogene=TRUE,
processed_transcript=FALSE,
miRNA=FALSE,
rRNA=TRUE,
snRNA=TRUE,
snoRNA=TRUE,
misc_RNA=TRUE,
non_coding=FALSE,
IG_C_gene=TRUE,
IG_J_gene=TRUE,
IG_V_gene=TRUE,
IG_V_pseudogene=FALSE
))
},
tair10 = {
return(list(
miRNA=FALSE,
ncRNA=FALSE,
protein_coding=FALSE,
pseudogene=TRUE,
rRNA=TRUE,
snoRNA=TRUE,
snRNA=TRUE,
transposable_element=FALSE,
tRNA=TRUE
))
},
equcab2 = {
return(list(
miRNA=FALSE,
misc_RNA=TRUE,
protein_coding=FALSE,
pseudogene=FALSE,
processed_pseudogene=FALSE,
rRNA=TRUE,
snoRNA=TRUE,
snRNA=TRUE
))
}
)
}
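# Illustrative sketch (inert): compare the strict biotype filter with the
# package defaults for the same genome to see which biotypes are additionally
# flagged for exclusion by the strict presets.
if (FALSE) {
    def <- get.defaults("biotype.filter","hg19")
    strict <- get.strict.biofilter("hg19")
    extra <- names(strict)[unlist(strict) & !unlist(def[names(strict)])]
    extra # biotypes excluded only under the strict filter
}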
#' Return several analysis options given an analysis preset
#'
#' This is a helper function which returns a set of metaseqr pipeline options,
#' grouped together according to a preset keyword. It is intended mostly for
#' internal use.
#'
#' @param preset preset can be one of \code{"all.basic"}, \code{"all.normal"},
#' \code{"all.full"}, \code{"medium.basic"}, \code{"medium.normal"},
#' \code{"medium.full"}, \code{"strict.basic"}, \code{"strict.normal"} or
#' \code{"strict.full"}, each of which controls the strictness of the analysis and
#' the amount of data to be exported. For an explanation of the presets, see the
#' main \code{\link{metaseqr}} help page.
#' @param org one of the supported organisms. See \code{\link{metaseqr}} main
#' help page.
#' @return A named list with names \code{exon.filters}, \code{gene.filters},
#' \code{pcut}, \code{export.what}, \code{export.scale}, \code{export.values} and
#' \code{export.stats}, each of which correspond to an element of the metaseqr
#' pipeline.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' strict.preset <- get.preset.opts("strict.basic","mm9")
#'}
get.preset.opts <- function(preset,org) {
# Override filter rules and maybe norm.args and stat.args
switch(preset,
all.basic = {
exon.filters <- NULL
gene.filters <- NULL
pcut <- NA
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change")
export.scale <- c("natural","log2")
export.values <- c("normalized")
export.stats <- c("mean")
},
all.normal = {
exon.filters <- NULL
gene.filters <- NULL
pcut <- NA
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change","stats",
"counts")
export.scale <- c("natural","log2")
export.values <- c("normalized")
export.stats <- c("mean","sd","cv")
},
all.full = {
exon.filters <- NULL
gene.filters <- NULL
pcut <- NA
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change","stats",
"counts","flags")
export.scale <- c("natural","log2","log10","vst")
export.values <- c("raw","normalized")
export.stats <- c("mean","median","sd","mad","cv","rcv")
},
medium.basic = {
exon.filters <- list(
min.active.exons=list(
exons.per.gene=5,
min.exons=2,
frac=1/5
)
)
gene.filters <- list(
length=list(
length=500
),
avg.reads=list(
average.per.bp=100,
quantile=0.25
),
expression=list(
median=TRUE,
mean=FALSE,
quantile=NA,
known=NA,
custom=NA
),
biotype=get.defaults("biotype.filter",org[1])
)
pcut <- 0.05
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change")
export.scale <- c("natural","log2")
export.values <- c("normalized")
export.stats <- c("mean")
},
medium.normal = {
exon.filters <- list(
min.active.exons=list(
exons.per.gene=5,
min.exons=2,
frac=1/5
)
)
gene.filters <- list(
length=list(
length=500
),
avg.reads=list(
average.per.bp=100,
quantile=0.25
),
expression=list(
median=TRUE,
mean=FALSE,
quantile=NA,
known=NA,
custom=NA
),
biotype=get.defaults("biotype.filter",org[1])
)
pcut <- 0.05
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change","stats",
"counts")
export.scale <- c("natural","log2")
export.values <- c("normalized")
export.stats <- c("mean","sd","cv")
},
medium.full = {
exon.filters <- list(
min.active.exons=list(
exons.per.gene=5,
min.exons=2,
frac=1/5
)
)
gene.filters <- list(
length=list(
length=500
),
avg.reads=list(
average.per.bp=100,
quantile=0.25
),
expression=list(
median=TRUE,
mean=FALSE,
quantile=NA,
known=NA,
custom=NA
),
biotype=get.defaults("biotype.filter",org[1])
)
pcut <- 0.05
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change","stats",
"counts","flags")
export.scale <- c("natural","log2","log10","vst")
export.values <- c("raw","normalized")
export.stats <- c("mean","median","sd","mad","cv","rcv")
},
strict.basic = {
exon.filters=list(
min.active.exons=list(
exons.per.gene=4,
min.exons=2,
frac=1/4
)
)
gene.filters=list(
length=list(
length=750
),
avg.reads=list(
average.per.bp=100,
quantile=0.5
),
expression=list(
median=TRUE,
mean=FALSE,
quantile=NA,
known=NA,
custom=NA
),
biotype=get.strict.biofilter(org[1])
)
pcut <- 0.01
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change")
export.scale <- c("natural","log2")
export.values <- c("normalized")
export.stats <- c("mean")
},
strict.normal = {
exon.filters=list(
min.active.exons=list(
exons.per.gene=4,
min.exons=2,
frac=1/4
)
)
gene.filters=list(
length=list(
length=750
),
avg.reads=list(
average.per.bp=100,
quantile=0.5
),
expression=list(
median=TRUE,
mean=FALSE,
quantile=NA,
known=NA,
custom=NA
),
biotype=get.strict.biofilter(org[1])
)
pcut <- 0.01
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change","stats",
"counts")
export.scale <- c("natural","log2")
export.values <- c("normalized")
export.stats <- c("mean","sd","cv")
},
strict.full = {
exon.filters=list(
min.active.exons=list(
exons.per.gene=4,
min.exons=2,
frac=1/4
)
)
gene.filters=list(
length=list(
length=750
),
avg.reads=list(
average.per.bp=100,
quantile=0.5
),
expression=list(
median=TRUE,
mean=FALSE,
quantile=NA,
known=NA,
custom=NA
),
biotype=get.strict.biofilter(org[1])
)
pcut <- 0.01
export.what <- c("annotation","p.value","adj.p.value",
"meta.p.value","adj.meta.p.value","fold.change","stats",
"counts","flags")
export.scale <- c("natural","log2","log10","vst")
export.values <- c("raw","normalized")
export.stats <- c("mean","median","sd","mad","cv","rcv")
}
)
preset.opts <- list(
exon.filters=exon.filters,
gene.filters=gene.filters,
pcut=pcut,
export.what=export.what,
export.scale=export.scale,
export.values=export.values,
export.stats=export.stats
)
return(preset.opts)
}
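# Illustrative sketch (inert): fetch a preset and inspect the thresholds it
# implies; each member of the returned list maps onto the metaseqr argument of
# the same meaning.
if (FALSE) {
    preset <- get.preset.opts("strict.full","mm9")
    preset$pcut                        # 0.01
    preset$gene.filters$length$length  # 750
    preset$export.scale                # "natural" "log2" "log10" "vst"
}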
#' Calculates fold changes
#'
#' Returns a matrix of fold changes based on the requested contrast, the list of
#' all samples and the data matrix which is produced by the metaseqr workflow. For
#' details on the \code{contrast}, \code{sample.list} and \code{log.offset}
#' parameters, see the main usage page of metaseqr. This function is intended
#' mostly for internal use but can also be used independently.
#'
#' @param contrast the vector of requested statistical comparison contrasts.
#' @param sample.list the list containing condition names and the samples under
#' each condition.
#' @param data.matrix a matrix of gene expression data whose column names are the
#' same as the sample names included in the sample list.
#' @param log.offset a number to be added to each element of data matrix in order
#' to avoid Infinity on log type data transformations.
#' @return A matrix of fold change ratios, treatment to control, as these are
#' parsed from contrast.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' fc <- make.fold.change("Control_vs_Treatment",list(Control=c("C1","C2"),
#' Treatment=c("T1","T2")),data.matrix)
#'}
make.fold.change <- function(contrast,sample.list,data.matrix,log.offset=1) {
conds <- strsplit(contrast,"_vs_")[[1]]
    #if (!is.matrix(data.matrix) && !is.data.frame(data.matrix)) { # Vector, nrow=1
# nn <- names(data.matrix)
# data.matrix <- t(as.matrix(data.matrix))
# colnames(data.matrix) <- nn
#}
fold.mat <- matrix(0,nrow(data.matrix),length(conds)-1)
for (i in 2:length(conds)) { # First condition is ALWAYS reference
samples.nom <- sample.list[[conds[i]]]
samples.denom <- sample.list[[conds[1]]]
nom <- data.matrix[,match(samples.nom,colnames(data.matrix)),drop=FALSE]
denom <- data.matrix[,match(samples.denom,colnames(data.matrix)),
drop=FALSE]
if (!is.matrix(nom)) nom <- as.matrix(nom) # Cover the case with no replicates...
if (!is.matrix(denom)) denom <- as.matrix(denom)
mean.nom <- apply(nom,1,mean)
mean.denom <- apply(denom,1,mean)
#mean.nom <- ifelse(mean.nom==0,log.offset,mean.nom)
if (any(mean.nom==0)) mean.nom <- mean.nom + log.offset
#mean.denom <- ifelse(mean.denom==0,log.offset,mean.denom)
if (any(mean.denom==0)) mean.denom <- mean.denom + log.offset
fold.mat[,i-1] <- mean.nom/mean.denom
}
rownames(fold.mat) <- rownames(data.matrix)
colnames(fold.mat) <- paste(conds[1],"_vs_",conds[2:length(conds)],sep="")
return(fold.mat)
}
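# Illustrative sketch (inert): verify the fold change of a single gene by hand;
# it is the mean of the treatment samples over the mean of the control samples
# (log.offset is only added when one of the means is exactly zero).
if (FALSE) {
    dm <- matrix(c(10,12,40,44),1,4,
        dimnames=list("gene_1",c("C1","C2","T1","T2")))
    fc <- make.fold.change("Control_vs_Treatment",
        list(Control=c("C1","C2"),Treatment=c("T1","T2")),dm)
    fc["gene_1",1]                # 42/11 = 3.818...
    mean(c(40,44))/mean(c(10,12)) # the same value
}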
#' Calculates average expression for an MA plot
#'
#' Returns a matrix of average expressions (A in MA plot) based on the requested
#' contrast, the list of all samples and the data matrix which is produced by
#' the metaseqr workflow. For details on the \code{contrast}, \code{sample.list}
#' and \code{log.offset} parameters, see the main usage page of metaseqr.
#' This function is intended mostly for internal use but can also be used
#' independently.
#'
#' @param contrast the vector of requested statistical comparison contrasts.
#' @param sample.list the list containing condition names and the samples under
#' each condition.
#' @param data.matrix a matrix of gene expression data whose column names are the
#' same as the sample names included in the sample list.
#' @param log.offset a number to be added to each element of data matrix in order
#' to avoid Infinity on log type data transformations.
#' @return A matrix of average expression values (the A values of an MA plot) for
#' the comparisons parsed from contrast.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' a <- make.avg.expression("Control_vs_Treatment",list(Control=c("C1","C2"),
#' Treatment=c("T1","T2")),data.matrix)
#'}
make.avg.expression <- function(contrast,sample.list,data.matrix,log.offset=1) {
conds <- strsplit(contrast,"_vs_")[[1]]
a.mat <- matrix(0,nrow(data.matrix),length(conds)-1)
for (i in 2:length(conds)) { # First condition is ALWAYS reference
samples.nom <- sample.list[[conds[i]]]
samples.denom <- sample.list[[conds[1]]]
nom <- data.matrix[,match(samples.nom,colnames(data.matrix))]
denom <- data.matrix[,match(samples.denom,colnames(data.matrix))]
if (!is.matrix(nom)) nom <- as.matrix(nom) # Cover the case with no replicates...
if (!is.matrix(denom)) denom <- as.matrix(denom)
mean.nom <- apply(nom,1,mean)
mean.denom <- apply(denom,1,mean)
if (any(mean.nom==0)) mean.nom <- mean.nom + log.offset
if (any(mean.denom==0)) mean.denom <- mean.denom + log.offset
a.mat[,i-1] <- 0.5*(log2(mean.nom)+log2(mean.denom))
}
rownames(a.mat) <- rownames(data.matrix)
colnames(a.mat) <- paste(conds[1],"_vs_",conds[2:length(conds)],sep="")
return(a.mat)
}
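# Illustrative sketch (inert): combine make.avg.expression (the A values) with
# the log2 of make.fold.change (the M values) to draw a quick MA plot for one
# contrast with base graphics.
if (FALSE) {
    dm <- round(1000*matrix(runif(400),100,4))
    rownames(dm) <- paste("gene_",1:100,sep="")
    colnames(dm) <- c("C1","C2","T1","T2")
    sl <- list(Control=c("C1","C2"),Treatment=c("T1","T2"))
    a <- make.avg.expression("Control_vs_Treatment",sl,dm)
    m <- log2(make.fold.change("Control_vs_Treatment",sl,dm))
    plot(a[,1],m[,1],pch=20,xlab="A",ylab="M")
}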
#' HTML report helper
#'
#' Returns a character matrix with html formatted table cells. Essentially, it
#' converts the input data to text and places them in a <td></td> tag set.
#' Internal use.
#'
#' @param mat the data matrix (numeric or character)
#' @param type the type of data in the matrix (\code{"numeric"} or
#' \code{"character"}).
#' @param digits the number of digits on the right of the decimal points to pass
#' to \code{\link{formatC}}. It has meaning when \code{type="numeric"}.
#' @return A character matrix with html formatted cells.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' the.cells <- make.html.cells(data.matrix)
#'}
make.html.cells <- function(mat,type="numeric",digits=3) {
if (type=="numeric")
tmp <- format(mat,digits=digits)
#tmp <- formatC(mat,digits=digits,format="f")
else
tmp <- mat
if (!is.matrix(tmp)) tmp <- as.matrix(tmp)
tmp <- apply(tmp,c(1,2),function(x) paste("<td>",x,"</td>",sep=""))
return(tmp)
}
#' HTML report helper
#'
#' Returns a character vector with html formatted rows. Essentially, it collapses
#' every row of a matrix to a single character and puts a <tr></tr> tag set around.
#' It is meant to be applied to the output of \code{\link{make.html.cells}}.
#' Internal use.
#'
#' @param mat the data matrix, usually the output of \code{\link{make.html.cells}}
#' function.
#' @return A character vector with html formatted rows of a matrix.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' the.cells <- make.html.cells(data.matrix)
#' the.rows <- make.html.rows(the.cells)
#'}
make.html.rows <- function(mat) {
tmp <- apply(mat,1,paste,collapse="")
tmp <- paste("<tr>",tmp,"</tr>",sep="")
return(tmp)
}
#' HTML report helper
#'
#' Returns a character vector with an html formatted table head row. Essentially,
#' it collapses the input row to a single character and puts a <th></th> tag set
#' around. It is meant to be applied to the output of \code{\link{make.html.cells}}.
#' Internal use.
#'
#' @param h the colnames of a matrix or data frame, usually as output of
#' \code{\link{make.html.cells}} function.
#' @return A character vector with html formatted header of a matrix.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' the.cells <- make.html.cells(data.matrix)
#' the.header <- make.html.header(the.cells[1,])
#'}
make.html.header <- function(h) {
tmp <- paste("<th>",h,"</th>",sep="")
tmp <- paste(tmp,collapse="")
tmp <- paste("<tr>",tmp,"</tr>",sep="")
return(tmp)
}
#' HTML report helper
#'
#' Returns a character vector with an html formatted table. Essentially, it
#' collapses the input rows to a single character and puts a <tbody></tbody>
#' tag set around. It is meant to be applied to the output of
#' \code{\link{make.html.rows}}. Internal use.
#'
#' @param mat the character vector produced by \code{\link{make.html.rows}}.
#' @return A character vector with the body of mat formatted in html.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' the.cells <- make.html.cells(data.matrix)
#' the.header <- make.html.header(the.cells[1,])
#' the.rows <- make.html.rows(the.cells)
#' the.body <- make.html.body(the.rows)
#'}
make.html.body <- function(mat) {
tmp <- paste(mat,collapse="")
return(tmp)
}
#' HTML report helper
#'
#' Returns a character vector with a fully html formatted table. Essentially, it
#' binds the outputs of \code{\link{make.html.cells}}, \code{\link{make.html.rows}},
#' \code{\link{make.html.header}} and \code{\link{make.html.body}} to the final
#' table and optionally assigns an id attribute. The above functions are meant to
#' format a data table so as it can be rendered by external tools such as
#' DataTables.js during a report creation. It is meant for internal use.
#'
#' @param b the table body as produced by \code{\link{make.html.body}}.
#' @param h the table header as produced by \code{\link{make.html.header}}.
#' @param id the table id attribute.
#' @return A fully formatted html table.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' the.cells <- make.html.cells(data.matrix)
#' the.header <- make.html.header(the.cells[1,])
#' the.rows <- make.html.rows(the.cells)
#' the.body <- make.html.body(the.rows)
#' the.table <- make.html.table(the.body,the.header,id="my_table")
#'}
make.html.table <- function(b,h=NULL,id=NULL) {
if (!is.null(id))
html <- paste("<table id=\"",id,"\" class=\"datatable\">",sep="")
else
html <- "<table class=\"datatable\">"
if (!is.null(h))
html <- paste(html,"<thead>",h,"</thead>",sep="")
html <- paste(html,"<tbody>",b,"</tbody></table>",sep="")
return(html)
}
#' Calculates several transformations of counts
#'
#' Returns a list of transformed (normalized) counts, based on the input count
#' matrix data.matrix. The data transformations are passed from the
#' \code{export.scale} parameter and the output list is named accordingly. This
#' function is intended mostly for internal use but can also be used independently.
#'
#' @param data.matrix the raw or normalized counts matrix. Each column represents
#' one input sample.
#' @param export.scale a character vector containing one or more of the supported
#' data transformations (\code{"natural"}, \code{"log2"}, \code{"log10"},
#' \code{"vst"}, \code{"rpgm"}). See also the main help page of metaseqr.
#' @param scf a scaling factor for the reads of each gene, for example the sum of
#' exon lengths or the gene length. Each read count is divided by this factor when
#' \code{export.scale} contains \code{"rpgm"}. It provides an RPKM-like measure
#' but not the actual RPKM, as this normalization is not supported.
#' @param log.offset a number to be added to each element of data.matrix in order
#' to avoid Infinity on log type data transformations.
#' @return A named list whose names are the elements in export.scale. Each list
#' member is the respective transformed data matrix.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' tr <- make.transformation(data.matrix,c("log2","vst"))
#' head(tr$vst)
#'}
make.transformation <- function(data.matrix,export.scale,
scf=NULL,log.offset=1) {
mat <- vector("list",length(export.scale))
names(mat) <- export.scale
if (!is.matrix(data.matrix)) data.matrix <- as.matrix(data.matrix)
if (is.null(scf)) scf <- rep(1,nrow(data.matrix))
for (scl in export.scale) {
switch(scl,
natural = {
mat[[scl]] <- data.matrix
},
log2 = {
mat[[scl]] <- nat2log(data.matrix,base=2,off=log.offset)
},
log10 = {
mat[[scl]] <- nat2log(data.matrix,base=10,off=log.offset)
},
vst = {
fit <- vsn2(data.matrix,verbose=FALSE)
mat[[scl]] <- predict(fit,newdata=data.matrix)
},
rpgm = {
mat[[scl]] <- data.matrix
for (i in 1:ncol(data.matrix))
mat[[scl]][,i] <- data.matrix[,i]/scf
}
)
}
return(mat)
}
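# Illustrative sketch (inert): the "rpgm" scale divides each column of the count
# matrix by a per-gene scaling factor given in scf, for example a vector of gene
# or exon-model lengths. The lengths below are made up for the illustration.
if (FALSE) {
    dm <- round(1000*matrix(runif(40),10,4))
    rownames(dm) <- paste("gene_",1:10,sep="")
    colnames(dm) <- c("C1","C2","T1","T2")
    gene.len <- sample(500:5000,10) # hypothetical gene lengths in bp
    tr <- make.transformation(dm,c("natural","rpgm"),scf=gene.len)
    head(tr$rpgm)
}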
#' Calculates several statistics on read counts
#'
#' Returns a matrix of statistics calculated for a set of given samples. Internal
#' use.
#'
#' @param samples a set of samples from the dataset under processing. They should
#' match sample names from \code{sample.list}. See also the main help page of
#' \code{\link{metaseqr}}.
#' @param data.list a list containing natural or transformed data, typically an
#' output from \code{\link{make.transformation}}.
#' @param stat the statistics to calculate. Can be one or more of \code{"mean"},
#' \code{"median"}, \code{"sd"}, \code{"mad"}, \code{"cv"}, \code{"rcv"}. See also
#' the main help page of \code{\link{metaseqr}}.
#' @param export.scale the output transformations used as input also to
#' \code{\link{make.transformation}}.
#' @return A matrix of statistics calculated based on the input sample names. The
#' different data transformations are appended columnwise.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' tr <- make.transformation(data.matrix,c("log2","vst"))
#' st <- make.stat(c("C1","C2"),tr,c("mean","sd"),c("log2","vst"))
#'}
make.stat <- function(samples,data.list,stat,export.scale) {
stat.result <- vector("list",length(export.scale))
names(stat.result) <- export.scale
for (scl in export.scale) {
stat.data <- data.list[[scl]][,match(samples,
colnames(data.list[[scl]]))]
if (!is.matrix(stat.data)) stat.data <- as.matrix(stat.data)
switch(stat,
mean = {
stat.result[[scl]] <- apply(stat.data,1,function(x,s) {
if (s=="natural")
return(round(mean(x)))
else
return(mean(x))
},scl)
},
median = {
stat.result[[scl]] <- apply(stat.data,1,function(x,s) {
if (s=="natural")
return(round(median(x)))
else
return(median(x))
},scl)
},
sd = {
stat.result[[scl]] <- apply(stat.data,1,function(x,s) {
if (s=="natural")
return(ceiling(sd(x)))
else
return(sd(x))
},scl)
},
mad = {
stat.result[[scl]] <- apply(stat.data,1,function(x,s) {
if (s=="natural")
return(ceiling(mad(x)))
else
return(mad(x))
},scl)
},
cv = {
stat.result[[scl]] <- apply(stat.data,1,function(x,s) {
if (s=="natural")
return(ceiling(sd(x))/round(mean(x)))
else
return(sd(x)/mean(x))
},scl)
},
rcv = {
stat.result[[scl]] <- apply(stat.data,1,function(x,s) {
if (s=="natural")
return(ceiling(mad(x))/round(median(x)))
else
return(mad(x)/median(x))
},scl)
}
)
}
return(do.call("cbind",stat.result))
}
#' Results output build helper
#'
#' Returns a list of matrices based on the export scales that have been chosen
#' from the main function and a subset of samples based on the sample names
#' provided in the \code{sample.list} argument of the main \code{\link{metaseqr}}
#' function. Internal use.
#'
#' @param samples a set of samples from the dataset under processing. They should
#' match sample names from \code{sample.list}. See also the main help page of
#' \code{\link{metaseqr}}.
#' @param data.list a list containing natural or transformed data, typically an
#' output from \code{\link{make.transformation}}.
#' @param export.scale the output transformations used as input also to
#' \code{\link{make.transformation}}.
#' @return A named list whose names are the elements in \code{export.scale}.
#' Each list member is the respective sample subset data matrix.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data.matrix <- round(1000*matrix(runif(400),100,4))
#' rownames(data.matrix) <- paste("gene_",1:100,sep="")
#' colnames(data.matrix) <- c("C1","C2","T1","T2")
#' tr <- make.transformation(data.matrix,c("log2","vst"))
#' mm <- make.matrix(c("C1","T1"),tr,"log2")
#' head(mm)
#'}
make.matrix <- function(samples,data.list,export.scale="natural") {
mat <- vector("list",length(export.scale))
names(mat) <- export.scale
for (scl in export.scale) {
mat.data <- data.list[[scl]][,match(samples,
colnames(data.list[[scl]]))]
if (!is.matrix(mat.data)) {
mat.data <- as.matrix(mat.data)
colnames(mat.data) <- samples
}
mat[[scl]] <- mat.data
}
return(do.call("cbind",mat))
}
#' Create contrast lists from contrast vectors
#'
#' Returns a list, properly structured to be used within the \code{stat.*}
#' functions of the metaseqr package. See the main documentation for the structure
#' of this list and the example below. This function is mostly for internal use,
#' as the \code{stat.*} functions can be supplied directly with the contrasts
#' vector which is one of the main \code{\link{metaseqr}} arguments.
#'
#' @param contrast a vector of contrasts in the form "ConditionA_vs_ConditionB"
#' or "ConditionA_vs_ConditionB_vs_ConditionC_vs_...".
#' In case of Control vs Treatment designs, the Control condition should ALWAYS
#' be the first.
#' @param sample.list the list of samples in the experiment. See also the main
#' help page of \code{\link{metaseqr}}.
#' @return A named list whose names are the contrasts and its members are named
#' vectors, where the names are the sample names and the
#' actual vector members are the condition names. See the example.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' sample.list <- list(Control=c("C1","C2"),TreatmentA=c("TA1","TA2"),
#' TreatmentB=c("TB1","TB2"))
#' contrast <- c("Control_vs_TreatmentA","Control_vs_TreatmentA_vs_TreatmentB"),
#' cl <- make.contrast.list(contrast,sample.list)
#' cl
#'}
make.contrast.list <- function(contrast,sample.list) {
# Construction
contrast.list <- vector("list",length(contrast))
names(contrast.list) <- contrast
# First break the contrast vector
cnts <- strsplit(contrast,"_vs_")
names(cnts) <- names(contrast.list)
# Create list members
for (n in names(contrast.list)) {
contrast.list[[n]] <- vector("list",length(cnts[[n]]))
for (i in 1:length(cnts[[n]])) {
contrast.list[[n]][[i]] <- rep(cnts[[n]][i],
length(sample.list[[cnts[[n]][i]]]))
names(contrast.list[[n]][[i]]) <- sample.list[[cnts[[n]][[i]]]]
}
}
return(contrast.list)
}
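# Illustrative sketch (inert): for a simple Control vs Treatment contrast the
# returned object is a list with one member per contrast; each member holds one
# named vector per condition, the names being the sample names and the values
# the condition names.
if (FALSE) {
    sl <- list(Control=c("C1","C2"),Treatment=c("T1","T2"))
    cl <- make.contrast.list("Control_vs_Treatment",sl)
    cl[["Control_vs_Treatment"]][[1]] # C1="Control", C2="Control"
    cl[["Control_vs_Treatment"]][[2]] # T1="Treatment", T2="Treatment"
}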
#' Creates sample list from file
#'
#' Create the main sample list from an external file.
#'
#' @param input a tab-delimited file structured as follows: the first line of the
#' external tab delimited file should contain column names (the names themselves
#' are not important). The first column MUST contain UNIQUE sample names and the
#' second column MUST contain the biological condition to which each of the
#' samples in the first column belongs.
#' @param type one of \code{"simple"} or \code{"targets"} to indicate if the input
#' is a simple two column text file or the targets file used to launch the main
#' analysis pipeline.
#' @return A named list whose names are the conditions of the experiments and its
#' members are the samples belonging to each condition.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' targets <- data.frame(sample=c("C1","C2","T1","T2"),
#' condition=c("Control","Control","Treatment","Treatment"))
#' write.table(targets,file="targets.txt",sep="\t",row.names=FALSE,quote="")
#' sample.list <- make.sample.list("targets.txt")
#'}
make.sample.list <- function(input,type=c("simple","targets")) {
if (missing(input) || !file.exists(input))
stopwrap("File to make sample list from should be a valid existing ",
"text file!")
type <- tolower(type[1])
check.text.args("type",type,c("simple","targets"),multiarg=FALSE)
tab <- read.delim(input)
samples <- as.character(tab[,1])
if (type=="simple") {
ii <- 2
conditions <- unique(as.character(tab[,ii]))
}
else if (type=="targets") {
ii <- 3
conditions <- unique(as.character(tab[,ii]))
}
if (length(samples) != length(unique(samples)))
stopwrap("Sample names must be unique for each sample!")
sample.list <- vector("list",length(conditions))
names(sample.list) <- conditions
for (n in conditions)
sample.list[[n]] <- samples[which(as.character(tab[,ii])==n)]
return(sample.list)
}
#' Project path constructor
#'
#' Create the main metaseqr project path. Internal use only.
#'
#' @param path The desired project path. Can be NULL for auto-generation.
#' @param f The input counts table file.
#' @return A list with project path elements.
#' @author Panagiotis Moulos
make.project.path <- function(path,f=NULL) {
    if (is.null(path) || is.na(path)) {
if (!is.data.frame(f) && !is.null(f) && !is.list(f) && file.exists(f)) {
if (length(grep(".RData$",f))>0)
main.path <- file.path(getwd(),paste("metaseqr_result_",
format(Sys.time(),format="%Y%m%d%H%M%S"),sep=""))
else
main.path <- file.path(dirname(f),paste("metaseqr_result_",
format(Sys.time(),format="%Y%m%d%H%M%S"),sep=""))
}
else
main.path <- file.path(getwd(),paste("metaseqr_result_",
format(Sys.time(),format="%Y%m%d%H%M%S"),sep=""))
project.path <- make.path.struct(main.path)
}
else {
success <- tryCatch(
if (!file.exists(path)) dir.create(path,recursive=TRUE) else TRUE,
error=function(e) {
disp("Cannot create ",path,"! Is it a valid system path? Is ",
"there a write permissions problem? Reverting to ",
"automatic creation...")
return(FALSE)
},
finally=""
)
if (success)
project.path <- make.path.struct(path)
else
project.path <- make.project.path(NA,f)
}
return(project.path)
}
#' Project path constructor helper
#'
#' Helper for \code{make.project.path}. Internal use only.
#'
#' @param main.path The desired project path.
#' @return A named list with the project output directories (main, media, data,
#' logs, lists, plots, qc, normalization, statistics).
#' @author Panagiotis Moulos
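#' @examples
#' \dontrun{
#' # Minimal sketch: creates the standard output sub-directories under tempdir()
#' pp <- make.path.struct(file.path(tempdir(),"test_project"))
#' names(pp)
#'}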
make.path.struct <- function(main.path) {
project.path <- list(
main=main.path,
media=file.path(main.path,"media"),
data=file.path(main.path,"data"),
logs=file.path(main.path,"logs"),
lists=file.path(main.path,"lists"),
plots=file.path(main.path,"plots"),
qc=file.path(main.path,"plots","qc"),
normalization=file.path(main.path,"plots","normalization"),
statistics=file.path(main.path,"plots","statistics")
)
for (p in names(project.path))
if (!file.exists(project.path[[p]]))
dir.create(project.path[[p]],recursive=TRUE)
return(project.path)
}
#' Initialize output list
#'
#' Initializes the metaseqr output list. Internal use only.
#'
#' @param con The contrasts.
#' @return An empty named list.
#' @author Panagiotis Moulos
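#' @examples
#' \dontrun{
#' # Minimal sketch: one empty slot per contrast, to be filled later
#' out <- make.export.list(c("A_vs_B","A_vs_C"))
#' str(out)
#'}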
make.export.list <- function(con) {
f <- vector("list",length(con))
names(f) <- con
return(f)
}
#' Optimize rectangular grid plots
#'
#' Returns a vector for an optimized m x k plot grid to be used with e.g.
#' \code{par(mfrow)}, where m*k is as close as possible to the input n. Of course,
#' there will be empty grid positions if n < m*k.
#'
#' @param n An integer, denoting the total number of plots to be created.
#' @return A 2-element vector with the dimensions of the grid.
#' @author Panagiotis Moulos
#' @export
#' @examples
#' \dontrun{
#' g1 <- make.grid(16) # Returns c(4,4)
#' g2 <- make.grid(11) # Returns c(4,3)
#'}
make.grid <- function(n) {
m <- 0
while (n > m*m)
m <- m+1
if (n < m*m) {
k <- m-1
if (n > m*k)
k <- k+1
else {
while (n > m*k)
k <- k-1
}
}
else
k <- m
return(c(m,k))
}
#' Initializer of report messages
#'
#' Initializes the metaseqr report template messages. Internal use only.
#'
#' @param lang The language of the report. For now, only English (\code{"en"})
#' is supported.
#' @return A named list with messages for each input option.
#' @author Panagiotis Moulos
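#' @examples
#' \dontrun{
#' # Minimal sketch: fetch the English messages and inspect one entry
#' msgs <- make.report.messages("en")
#' msgs$norm$edaseq
#'}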
make.report.messages <- function(lang) {
switch(lang,
en = {
messages <- list(
org=list(
hg18=paste("human (<em>Homo sapiens</em>),",
"genome version alias hg18"),
hg19=paste("human (<em>Homo sapiens</em>),",
"genome version alias hg19"),
mm9=paste("mouse (<em>Mus musculus</em>),",
"genome version alias mm9"),
mm10=paste("mouse (<em>Mus musculus</em>),",
"genome version alias mm10"),
rn5=paste("rat (<em>Rattus norvegicus</em>),",
"genome version alias rn5"),
dm3=paste("fruitfly (<em>Drosophila melanogaster</em>),",
"genome version alias dm3"),
danrer7=paste("zebrafish (<em>Danio rerio</em>),",
"genome version alias danRer7"),
pantro5=paste("chimpanzee (<em>Pan troglodytes</em>),",
"genome version alias panTro5"),
susscr3=paste("pig (<em>Sus scrofa</em>),",
"genome version alias susScr3"),
tair10=paste("arabidopsis (<em>Arabidopsis thaliana</em>),",
"genome version alias TAIR10"),
bmori2=paste("silkworm (<em>Bombyx mori</em>),",
"genome version alias bmori2")
),
refdb=list(
ensembl="Ensembl genomes",
ucsc="UCSC genomes database",
refseq="RefSeq database"
),
whenfilter=list(
prenorm="before normalization",
postnorm="after normalization"
),
norm=list(
edaseq="EDASeq",
deseq="DESeq",
edger="edgeR",
noiseq="NOISeq",
nbpseq="NBPSeq",
each="the same as the corresponding statistical test"
),
stat=list(
deseq="DESeq",
edger="edgeR",
noiseq="NOISeq",
bayseq="baySeq",
limma="limma",
nbpseq="NBPSeq"
),
meta=list(
intersection="intersection of individual results",
union="union of individual results",
fisher="Fisher's method",
fperm="Fisher's method with permutations",
dperm.min=paste("samples permutation based method with",
"minimum p-values"),
dperm.max=paste("samples permutation based method with",
"maximum p-values"),
dperm.weight=paste("samples permutation based method with",
"weighted p-values"),
minp="minimum p-value across results",
maxp="maximum p-value across results",
weight="PANDORA weighted p-value across results",
pandora="PANDORA weighted p-value across results",
simes="Simes correction and combination method",
whitlock=paste("Whitlock's Z-transformation method",
"(Bioconductor package survcomp)"),
none=paste("no meta-analysis, reported p-values from the",
"first supplied statistical algorithm")
),
adjust=list(
holm="Holm FWER",
hochberg="Hochberg FWER",
hommel="Hommel FWER",
bonferroni="Bonferroni FWER",
bh="Benjamini-Hochberg FDR",
by="Benjamini-Yekutieli FDR",
fdr="Benjamini-Hochberg FDR",
none="no multiple test correction",
qvalue="Storey-Tibshirani FDR"
),
plots=list(
mds="multidimensional scaling",
biodetection="biotype detection",
countsbio="biotype counts",
saturation="sample and biotype saturation",
rnacomp="RNA composition",
boxplot="boxplots",
gcbias="GC-content bias",
lengthbias="transcript length bias",
meandiff="mean-difference plot",
meanvar="mean-variance plot",
deheatmap="DEG heatmap",
volcano="volcano plot",
biodist="DEG biotype detection",
filtered="filtered biotypes",
correl="correlation heatmap and correlogram",
pairwise="pairwise scatterplots between samples",
venn="Venn diagrams"
),
export=list(
annotation="Annotation",
p.value="p-value",
adj.p.value="Adjusted p-value (FDR)",
fold.change="Fold change",
stats="Statistics",
counts="Read counts",
natural="Natural scale",
log2="log2 scale",
log10="log10 scale",
vst="Variance stabilization transformation",
rpgm="Reads per Gene Model",
raw="Raw values",
normalized="Normalized values",
mean="Mean",
median="Median",
sd="Standard deviation",
mad="Median Absolute Deviation (MAD)",
cv="Coefficient of Variation",
rcv="Robust Coefficient of Variation"
),
preset=list(
all.basic=paste("use all genes and export all genes and",
"basic annotation and statistics elements"),
all.normal=paste("use all genes and export all genes and",
"normal annotation and statistics elements"),
all.full=paste("use all genes and export all genes and all",
"available annotation and statistics elements"),
medium.basic=paste("apply a medium set of filters and",
"export statistically significant genes and basic",
"annotation and statistics elements"),
medium.normal=paste("apply a medium set of filters and",
"export statistically significant genes and normal",
"annotation and statistics elements"),
medium.full=paste("apply a medium set of filters and",
"export statistically significant genes and all",
"available annotation and statistics elements"),
strict.basic=paste("apply a strict set of filters and",
"export statistically significant genes and basic",
"annotation and statistics elements"),
strict.normal=paste("apply a strict set of filters and",
"export statistically significant genes and normal",
"annotation and statistics elements"),
strict.full=paste("apply a strict set of filters and",
"export statistically significant genes and all",
"available annotation and statistics elements")
),
explain=list(
mds=paste(
"The Multi-Dimensional Scaling (MDS) plots comprise a means",
"of visualizing the level of similarity of individual cases",
"of a dataset. It is similar to Principal Component Analysis",
"(PCA), but instead of using the covariance matrix to find",
"similarities among cases, MDS uses absolute distance metrics",
"such as the classical Euclidean distance. Because of the",
"relative linear relations among sequencing samples, it",
"provides a more realistic clustering among samples. MDS",
"serves quality control and it can be interpreted as follows:",
"when the distance among samples of the same biological",
"condition in the MDS space is small, this is an indication",
"of high correlation and reproducibility among them. When",
"this distance is larger or heterogeneous (e.g. the 3rd",
"sample of a triplicate set is further from the other 2),",
"this constitutes an indication of low correlation and",
"reproducibility among samples. It can help exclude poor",
"samples from further analysis.",collapse=" "
),
biodetection=paste(
"The biotype detection bar diagrams are a set of quality",
"control charts that show the percentage of each biotype",
"in the genome (i.e. in the whole set of features provided,",
"for example, protein coding genes, non coding RNAs or",
"pseudogenes) in grey bars, which proportion has been",
"detected in a sample before normalization and after a",
"basic filtering by removing features with zero counts in",
"red lined bars, and the percentage of each biotype within",
"the sample in solid red bars. The difference between grey",
"bars and solid red bars is that the grey bars show the",
"percentage of a feature in the genome while the solid red",
"bars show the percentage in the sample. Thus, the solid",
"red bars may be sometimes higher than the grey bars because",
"certain features (e.g. protein coding genes) may be",
"detected within a sample with a higher proportion",
"relatively to their presence in the genome, as compared",
"with other features. For example, while the percentage",
"of protein coding genes in the whole genome is already",
"higher than other biotypes, this percentage is expected",
"to be even higher in an RNA-Seq experiment where one",
"expects protein-coding genes to exhibit greater abundance.",
"The vertical green line separates the most abundant",
"biotypes (on the left-hand side, corresponding to the",
"left axis scale) from the rest (on the right-hand side,",
"corresponding to the right axis scale). Otherwise, the",
"lower abundance biotypes would be indistinguishable.",
"Unexpected outcomes in this quality control chart (e.g.",
"very low detection of protein coding genes) would signify",
"possible low quality of a sample.",collapse=" "
),
countsbio=paste(
"The biotype detection counts boxplots are a set of quality",
"control charts that depict both the biological classification",
"for the detected features and the actual distribution of",
"the read counts for each biological type. The boxplot",
"comprises a means of summarizing the read counts distribution",
"of a sample in the form of a bar with extending lines,",
"as commonly used way of graphically presenting groups of",
"numerical data. A boxplot also indicates which observations,",
"if any, might be considered outliers and is able to visually",
"show different types of populations, without making any",
"assumptions of the underlying statistical distribution.",
"The spacing between the different parts of the box help",
"indicate variance, skewness and identify outliers. The",
"thick bar inside the colored box is the median of the",
"observations while the box extends over the Interquartile",
"Range of the observations. The whiskers extend up (down)",
"to +/-1.5xIQR. Unexpected outcomes (e.g. protein coding",
"read count distribution similar to pseudogene read count",
"distribution) indicates poor sample quality.",collapse=" "
),
saturation=paste(
"The read and biotype saturation plots are a set of quality",
"control charts that depict the read count saturation",
"levels at several sequencing depths. Thus, they comprise",
"a means of assessing whether the sequencing depth of an",
"RNA-Seq experiment is sufficient in order to detect the",
"biological features under investigation. These quality",
"control charts are separated in two subgroups: the first",
"subgroup (read saturation per biotype for all samples)",
"is a set of plots, one for each biological feature (e.g.",
"protein coding, pseudogene, lincRNA, etc.), that depict",
"the number of detected features in different sequencing",
"depths and for all samples in the same plot. The second",
"subgroup (read saturation per sample for all biotypes)",
"is a set of plots similar to the above, but with,",
"there is one pair of plots with two panels for each sample,",
"presenting all biological features. The left panel depicts",
"the saturation levels for the less abundatnt features,",
"while the right panel, the saturation for the more abundant",
"features, as placing them all together would make the",
"less abundant features indistinguishable. All the saturation",
"plots should be interpreted as follows: if the read counts",
"for a biotype tend to be saturated, the respective curve",
"should tend to reach a plateau at higher depths. Otherwise,",
"more sequencing is needed for the specific biotype.",
collapse=" "
),
readnoise=paste(
"The read noise plots depict the percentage of biological",
"features detected when subsampling the total number of",
"reads. Very steep curves in read noise plots indicate",
"that although the sequencing depth reaches its maximum,",
"a relatively small percentage of total features is detected,",
"indicating that the level of background noise is relatively",
"high. Less steep RNA composition curves, indicate less noise.",
"When a sample's curve deviate from the rest, it could",
"indicate lower or higher quality, depending on the curves",
"of the rest of the samples.",collapse=" "
),
correl=paste(
"The sample correlation plots depict the accordance among",
"the RNA-Seq samples, as this is manifested through the",
"read counts table used with the metaseqr pipeline, with",
"two representations that both use the correlation matrix",
"(a matrix which depicts all the pairwise correlations",
"between each pair of samples) of the read counts matrix.",
"The first is a correlation clustered heatmap which",
"depicts the correlations among samples as color-scaled",
"image and the hierarchical clustering tree depicts the",
"grouping of the samples according to their correlation.",
"Samples from the same group that are not clustered together",
"provides an indication that there might be a quality",
"problem with the dataset. The second is a 'correlogram'",
"plot, where again the samples are hierarchically clustered",
"and grouped but this time correlations are presented as",
"ellipses inside each cell. Each cell represents a pairwise",
"comparison and each correlation coefficient is represented",
"by an ellipse whose 'diameter', direction and color",
"depict the accordance for that pair of samples. Highly",
"correlated samples are depicted as ellipses with narrow",
"diameter, while poorly correlated samples are depicted",
"as ellipses with wide diameters. Also, highly correlated",
"samples are depicted as ellipses with a left-to-right",
"upwards direction while poorly correlated samples are",
"depicted as ellipses with a right-to-left upwards direction.",
collapse=" "
),
pairwise=paste(
"The pairwise comparison plots are split in three parts:",
"the upper diagonal consists of simple scatterplots for",
"all pairwise sample comparisons, together with their",
"Pearson correlation coefficient. It is a simple measure",
"of between sample correlation using all the available",
"data points instead of only the correlation matrix. The",
"lower diagonal consists of mean-difference plots for all",
"pairwise sample comparisons. A mean-difference plot (or",
"a Bland-Altman plots) is a method of data plotting used",
"in analyzing the agreement between two different",
"assays/variables. In this graphical method the differences",
"(or alternatively the ratios) between the two variables",
"are plotted against the averages of the two. Such a plot",
"is useful, for example, for analyzing data with strong",
"correlation between x and y axes, when the (x,y) dots on",
"the plot are close to the diagonal x=y. In this case, the",
"value of the transformed variable X is about the same as",
"x and y and the variable Y shows the difference between",
"x and y. In both represantations, irregular shapes of the",
"red smoother lines are an indication of poor correlation",
"between samples or of other systematic bias sources,",
"which is usually corrected through data normalization.",
collapse=" "
),
rnacomp=paste(
"The RNA composition plots depict the differences in the",
"distributions of reads in the same biological features",
"across samples. The following is taken from the NOISeq",
"vignette: <em>'...when two samples have different RNA",
"composition, the distribution of sequencing reads across",
"the features is different in such a way that although",
"a feature had the same number of read counts in both",
"samples, it would not mean that it was equally expressed",
"in both... To check if this bias is present in the data,",
"the RNA composition plot and the correponding diagnostic",
"test can be used. In this case, each sample s is compared",
"to the reference sample r (which can be arbitrarily",
"chosen). To do that, M values are computed as",
"log2(counts_sample = counts_reference). If no bias is",
"present, it should be expected that the median of M",
"values for each comparison is 0. Otherwise, it would be",
"indicating that expression levels in one of the samples",
"tend to be higher than in the other, and this could lead",
"to false discoveries when computing differencial expression.",
"Confidence intervals for the M median are also computed by",
"bootstrapping. If value 0 does not fall inside the interval,",
"it means that the deviation of the sample with regard",
"to the reference sample is statistically significant.",
"Therefore, a normalization procedure is required.'</em>",
collapse=" "
),
boxplot=paste(
"The boxplot comprises a means of summarizing the read",
"counts distribution of a sample in the form of a bar",
"with extending lines, as a commonly used way of",
"graphically presenting groups of numerical data. A",
"boxplot also indicates which observations, if any, might",
"be considered outliers and is able to visually show",
"different types of populations, without making any",
"assumptions about the underlying statistical distribution.",
"The spacings between the different parts of the box help",
"indicate variance, skewness and identify outliers. The",
"thick bar inside the colored box is the median of the",
"observations while the box extends over the Interquartile",
"Range of the observations. The whiskers extend up (down)",
"to +/-1.5xIQR. Boxplots at similar levels indicate good",
"quality of the normalization. If boxplots remain at",
"different levels after normalization, maybe another",
"normalization algorithm may have to be examined.",
"The un-normalized boxplots show the need for data",
"normalization in order for the data from different",
"samples to follow the same underlying distribution and",
"statistical testing becoming possible.",collapse=" "
),
gcbias=paste(
"The GC-content bias plot is a quality control chart that",
"shows the possible dependence of the read counts (in log2",
"scale) under a gene to the GC content percentage of that",
"gene. In order for the statistical tests to be able to",
"detect statistical significance which occurs due to real",
"biological effects and not by other systematic biases",
"present in the data (e.g. a possible GC-content bias),",
"the latter should be accounted for by the applied",
"normalization algorithm. Although the tests are performed",
"for each gene across biological conditions one could assume",
"that the GC content does not represent a bias, as it is the",
"same for the tested gene across samples and conditions.",
"However, Risso et al. (2011) showed that the GC-content",
"could have an impact in the statistical testing procedure.",
"The GC-content bias plot depicts the dependence of the",
"read counts to the GC content before and after normalization.",
"The smoothing lines for each sample, should be as 'straight'",
"as possible after normalization. In addition, if the",
"smoothing lines differ significantly between biological",
"conditions, this would constitute a possible quality warning.",
collapse=" "
),
lengthbias=paste(
"The gene/transcript length bias plot is a quality control",
"chart that shows the possible dependence of the read counts",
"(in log2 scale) under a gene to the length that gene (whole",
"gene or sum of exons depending on the analysis). In order",
"for the statistical tests to be able to detect statistical",
"significance which occurs due to real biological effects",
"and not by other systematic biases present in the data",
"(e.g. a possible length bias), the latter should be accounted",
"for by the applied normalization algorithm. Although the",
"tests are performed for each gene across bioogical conditions,",
"one could assume that the gene length does not represent",
"a bias as it's the same for the tested gene across samples",
"and conditions. However, it has been shown in several",
"studies that the gene length could have an impact on the",
"statistical testing procedure. The length bias plot",
"depicts the dependence of the read counts to the",
"gene/transcript length before and after normalization.",
"The smoothing lines for each sample, should be as 'straight'",
"as possible after normalization. In addition, if the",
"smoothing lines differ significantly among biological",
"conditions, this would constitute a possible quality warning.",
collapse=" "
),
meandiff=paste(
"A mean-difference plot (or a Bland-Altman plot) is a",
"method of data plotting used in analyzing the agreement",
"between two different assays/variables. In this graphical",
"method the differences (or alternatively the ratios)",
"between the two variables are plotted against the averages",
"of the two. Such a plot is useful, for example, for analyzing",
"data with strong correlation between x and y axes, when",
"the (x,y) dots on the plot are close to the diagonal x=y.",
"In this case, the value of the transformed variable X is",
"about the same as x and y and the variable Y shows the",
"difference between x and y. When the data cloud in a mean",
"difference plot is centered around the horizontal zero line,",
"this is an indication of good data quality and good",
"normalization results. On the other hand, when the data",
"cloud deviates from the center line or has a 'banana'",
"shape, this constitutes an indication of systematic biases",
"present in the data and that either the chosen normalization",
"algorithm has not worked well, or that data are not",
"normalized. The smoothing curve that traverses the data",
"(red curve) summarizes the above trends.",collapse=" "
),
meanvar=paste(
"The mean-variance plot comprises a graphical means of",
"displaying a possible relationship between the means of",
"gene expression (counts) values and their variances",
"across replicates of the same biological condition. Thus",
"data can be inspected for possible overdispersion (greater",
"variability in a dataset than would be expected based on",
"a given simple statistical model). In such plots for",
"RNA-Seq data, overdispersion is usually manifested as",
"increasing variance with increasing gene expression",
"(counts) and it is summarized through a smoothing curve",
"(red curve). The following is taken from the EDASeq package",
"vignette: '<em>...although the Poisson distribution",
"is a natural and simple way to model count data, it has",
"the limitation of assuming equality of the mean and",
"variance. For this reason, the negative binomial",
"distribution has been proposed as an alternative when the",
"data show over-dispersion...'</em> If overdispersion is",
"not present, the data cloud is expected to be evenly",
"scattered around the smoothing curve.",collapse=" "
),
deheatmap=paste(
"The Differentially Expressed Genes (DEGs) heatmaps depict",
"how well samples from different conditions cluster",
"together according to their expression values after",
"normalization and statistical testing, for each requested",
"statistical contrast. If samples from the same biological",
"condition do not cluster together, this would constitute",
"a warning sign regarding the quality of the samples. In",
"addition, DEG heatmaps provide an initial view of",
"possible clusters of co-expressed genes.",collapse=" "
),
volcano=paste(
"A volcano plot is a scatterplot that is often used when",
"analyzing high-throughput -omics data (e.g. microarray",
"data, RNA-Seq data) to give an overview of interesting",
"genes. The log2 fold change is plotted on the x-axis and",
"the negative log10 p-value is plotted on the y-axis. A",
"volcano plot combines the results of a statistical test",
"(aka, p-values) with the magnitude of the change enabling",
"quick visual identification of those genes that display",
"large-magnitude changes that are also statistically",
"significant. The horizontal dashed line sets the threshold",
"for statistical significance, while the vertical dashed",
"lines set the thresholds for biological significance. It",
"should be noted that the volcano plots become harder to",
"interpret when using more than one statistical algorithm",
"and performing meta-analysis. This happens because the genes",
"that have stronger evidence of being differentially",
"expressed obtain lower p-values while the rest either",
"remain at similar levels or obtain higher p-values.",
"The result is a 'warped' volcano plot, with two",
"main data clouds: one in the upper part of the plot, and",
"one in the lower part of the plot. You can always zoom in",
"when using interacting mode (the default).",collapse=" "
),
biodist=paste(
"The chromosome and biotype distributions bar diagram for",
"Differentially Expressed Genes (DEGs) is split in two",
"panels: i) on the left panel DEGs are distributed per",
"chromosome and the percentage of each chromosome in the",
"genome is presented in grey bars, the percentage of DEGs",
"in each chromosome is presented in red lined bars and the",
"percentage of certain chromosomes in the distribution of",
"DEGs is presented in solid red bars. ii) on the right panel,",
"DEGs are distributed per biotype and the percentage of",
"each biotype in the genome (i.e. in the whole set of",
"features provided, for example, protein coding genes, non",
"coding RNAs or pseudogenes) is presented in grey bars,",
"the percentage of DEGs in each biotype is presented in",
"blue lined bars and the percentage of each biotype in",
"DEGs is presented in solid blue lines. The vertical green",
"line separates the most abundant biotypes (on the left-hand",
"side, corresponding to the left axis scale), from the rest",
"(on the right-hand side, corresponding to the right axis",
"scale). Otherwise, the lower abundance, biotypes would be",
"indistinguishable.",collapse=" "
),
filtered=paste(
"The chromosome and biotype distribution of filtered genes",
"is a quality control chart with two rows and four panels:",
"on the left panel of the first row, the bar chart depicts",
"the numbers of filtered genes per chromosome (actual numbers",
"shown above the bars). On the right panel of the first row,",
"the bar chart depicts the numbers of filtered genes per",
"biotype (actual numbers shown above the bars). On the left",
"panel of the second row, the bar chart depicts the fraction",
"of the filtered genes to the total genes per chromosome",
"(actual percentages shown above the bars). On the right",
"panel of the second row, the bar chart depicts the fraction",
"of the filtered genes to the total genes per biotype",
"(actual percentages shown above the bars). This plot",
"should indicate possible quality problems when for example",
"the filtered genes for a specific chromosome (or the",
"fraction) is extremely higher than the rest. Generally,",
"the fractions per chromosome should be uniform and the",
"fractions per biotype should be proportional to the biotype",
"fraction relative to the genome.",collapse=" "
),
venn=paste(
"The Venn diagrams are an intuitive way of presenting",
"overlaps between lists, based on the overlap of basic",
"geometrical shapes. The numbers of overlapping genes per",
"statistical algorithm are shown in the different areas",
"of the Venn diagrams, one for each contrast.",collapse=" "
)
),
references=list(
filein=list(
sam=paste("Statham, A.L., Strbenac, D., Coolen, M.W.,",
"Stirzaker, C., Clark, S.J., Robinson, M.D. (2010).",
"Repitools: an R package for the analysis of",
"enrichment-based epigenomic data. Bioinformatics",
"26(13), 1662-1663."),
bam=paste("Statham, A.L., Strbenac, D., Coolen, M.W.,",
"Stirzaker, C., Clark, S.J., Robinson, M.D. (2010).",
"Repitools: an R package for the analysis of",
"enrichment-based epigenomic data. Bioinformatics",
"26(13), 1662-1663."),
bed=paste("Lawrence, M., Gentleman, R., Carey, V.",
"(2009). rtracklayer: an R package for interfacing",
"with genome browsers. Bioinformatics 25(14),",
"1841-1842.")
),
norm=list(
edaseq=paste("Risso, D., Schwartz, K., Sherlock, G.,",
"and Dudoit, S. (2011). GC-content normalization",
"for RNA-Seq data. BMC Bioinformatics 12, 480."),
deseq=paste("Anders, S., and Huber, W. (2010).",
"Differential expression analysis for sequence",
"count data. Genome Biol 11, R106."),
edger=paste("Robinson, M.D., McCarthy, D.J., and",
"Smyth, G.K. (2010). edgeR: a Bioconductor package",
"for differential expression analysis of digital",
"gene expression data. Bioinformatics 26,",
"139-140."),
noiseq=paste("Tarazona, S., Garcia-Alcalde, F.,",
"Dopazo, J., Ferrer, A., and Conesa, A. (2011).",
"Differential expression in RNA-seq: a matter of",
"depth. Genome Res 21, 2213-2223."),
nbpseq=paste("Di, Y, Schafer, D., Cumbie, J.S., and",
"Chang, J.H. (2011). The NBP Negative Binomial",
"Model for Assessing Differential Gene Expression",
"from RNA-Seq. Statistical Applications in",
"Genetics and Molecular Biology 10(1), 1-28."),
none=NULL
),
stat=list(
deseq=paste("Anders, S., and Huber, W. (2010).",
"Differential expression analysis for sequence",
"count data. Genome Biol 11, R106."),
edger=paste("Robinson, M.D., McCarthy, D.J., and",
"Smyth, G.K. (2010). edgeR: a Bioconductor package",
"for differential expression analysis of digital",
"gene expression data. Bioinformatics 26,",
"139-140."),
noiseq=paste("Tarazona, S., Garcia-Alcalde, F.,",
"Dopazo, J., Ferrer, A., and Conesa, A. (2011).",
"Differential expression in RNA-seq: a matter of",
"depth. Genome Res 21, 2213-2223."),
limma=paste("Smyth, G. (2005). Limma: linear models",
"for microarray data. In Bioinformatics and",
"Computational Biology Solutions using R and",
"Bioconductor, G. R., C. V., D. S., I. R., and",
"H. W., eds. (New York, Springer), pp. 397-420."),
bayseq=paste("Hardcastle, T.J., and Kelly, K.A.",
"(2010). baySeq: empirical Bayesian methods for",
"identifying differential expression in sequence",
"count data. BMC Bioinformatics 11, 422."),
nbpseq=paste("Di, Y, Schafer, D., Cumbie, J.S., and",
"Chang, J.H. (2011). The NBP Negative Binomial",
"Model for Assessing Differential Gene Expression",
"from RNA-Seq. Statistical Applications in",
"Genetics and Molecular Biology 10(1), 1-28."),
ebseq=paste("Leng, N., Dawson, J.A., Thomson, J.A.,",
"Ruotti, V., Rissman, A.I., Smits, B.M., Haag,",
"J.D., Gould, M.N., Stewart, R.M., and",
"Kendziorski, C. (2013). EBSeq: an empirical",
"Bayes hierarchical model for inference in",
"RNA-seq experiments. Bioinformatics 29, 1035-1043")
),
meta=list(
fisher=paste("Fisher, R.A. (1932). Statistical",
"Methods for Research Workers (Edinburgh, Oliver",
"and Boyd)."),
fperm=paste("Fisher, R.A. (1932). Statistical",
"Methods for Research Workers (Edinburgh, Oliver",
"and Boyd)."),
whitlock=c(
paste("Whitlock, M.C. (2005). Combining",
"probability from independent tests:",
"the weighted Z-method is superior to Fisher's",
"approach. J Evol Biol 18, 1368-1373."),
paste("Schroder, M.S., Culhane, A.C., Quackenbush,",
"J., and Haibe-Kains, B. (2011). survcomp:",
"an R/Bioconductor package for performance",
"assessment and comparison of survival",
"models. Bioinformatics 27, 3206-3208.")
),
weight=paste("Genovese, C.R., Roeder, K., Wasserman,",
"L. (2006). False discovery control with p-value",
"weighting. Biometrika 93 (3): 509-524."),
simes=paste("Simes, R. J. (1986). An improved",
"Bonferroni procedure for multiple tests of",
"significance. Biometrika 73 (3): 751-754."),
none=NULL
),
multiple=list(
BH=paste("Benjamini, Y., and Hochberg, Y. (1995). ",
"Controlling the False Discovery Rate: A Practical",
"and Powerful Approach to Multiple Testing.",
"Journal of the Royal Statistical Society Series",
"B (Methodological) 57, 289-300."),
fdr=paste("Benjamini, Y., and Hochberg, Y. (1995). ",
"Controlling the False Discovery Rate: A Practical",
"and Powerful Approach to Multiple Testing.",
"Journal of the Royal Statistical Society Series",
"B (Methodological) 57, 289-300."),
BY=paste("Benjamini, Y., and Yekutieli, D. (2001). The",
"control of the false discovery rate in multiple",
"testing under dependency. Annals of Statistics",
"26, 1165-1188."),
bonferroni=paste("Shaffer, J.P. (1995). Multiple",
"hypothesis testing. Annual Review of",
"Psychology 46, 561-576."),
holm=paste("Holm, S. (1979). A simple sequentially",
"rejective multiple test procedure. Scandinavian",
"Journal of Statistics 6, 65-70."),
hommel=paste("Hommel, G. (1988). A stagewise rejective",
"multiple test procedure based on a modified",
"Bonferroni test. Biometrika 75, 383-386."),
hochberg=paste("Hochberg, Y. (1988). A sharper",
"Bonferroni procedure for multiple tests of",
"significance. Biometrika 75, 800-803."),
qvalue=paste("Storey, J.D., and Tibshirani, R. (2003).",
"Statistical significance for genomewide studies.",
"Proc Natl Acad Sci U S A 100, 9440-9445.")
),
figure=list(
mds=paste("Planet, E., Attolini, C.S., Reina, O.,",
"Flores, O., and Rossell, D. (2012). htSeqTools:",
"high-throughput sequencing quality control,",
"processing and visualization in R. Bioinformatics",
"28, 589-590."),
biodetection=paste("Tarazona, S., Garcia-Alcalde, F.,",
"Dopazo, J., Ferrer, A., and Conesa, A. (2011).",
"Differential expression in RNA-seq: a matter of",
"depth. Genome Res 21, 2213-2223."),
countsbio=paste("Tarazona, S., Garcia-Alcalde, F.,",
"Dopazo, J., Ferrer, A., and Conesa, A. (2011).",
"Differential expression in RNA-seq: a matter of",
"depth. Genome Res 21, 2213-2223."),
saturation=paste("Tarazona, S., Garcia-Alcalde, F.,",
"Dopazo, J., Ferrer, A., and Conesa, A. (2011).",
"Differential expression in RNA-seq: a matter of",
"depth. Genome Res 21, 2213-2223."),
readnoise=paste("Tarazona, S., Garcia-Alcalde, F.,",
"Dopazo, J., Ferrer, A., and Conesa, A. (2011).",
"Differential expression in RNA-seq: a matter of",
"depth. Genome Res 21, 2213-2223."),
gcbias=paste("Risso, D., Schwartz, K., Sherlock, G.,",
"and Dudoit, S. (2011). GC-content normalization",
"for RNA-Seq data. BMC Bioinformatics 12, 480."),
lengthbias=paste("Risso, D., Schwartz, K., Sherlock,",
"G., and Dudoit, S. (2011). GC-content",
"normalization for RNA-Seq data.",
"BMC Bioinformatics 12, 480."),
meandiff=paste("Risso, D., Schwartz, K., Sherlock, G.,",
"and Dudoit, S. (2011). GC-content normalization",
"for RNA-Seq data. BMC Bioinformatics 12, 480."),
meanvar=paste("Risso, D., Schwartz, K., Sherlock, G.,",
"and Dudoit, S. (2011). GC-content normalization",
"for RNA-Seq data. BMC Bioinformatics 12, 480."),
rnacomp=paste("Tarazona, S., Garcia-Alcalde, F.,",
"Dopazo, J., Ferrer, A., and Conesa, A. (2011).",
"Differential expression in RNA-seq: a matter of",
"depth. Genome Res 21, 2213-2223."),
biodist=paste("Tarazona, S., Garcia-Alcalde, F.,",
"Dopazo, J., Ferrer, A., and Conesa, A. (2011).",
"Differential expression in RNA-seq: a matter of",
"depth. Genome Res 21, 2213-2223."),
venn=paste("Chen, H., and Boutros, P.C. (2011).",
"VennDiagram: a package for the generation of",
"highly-customizable Venn and Euler diagrams in R.",
"BMC Bioinformatics 12, 35."),
filtered=NULL
)
)
)
}
)
return(messages)
}
#' Interactive volcano plot helper
#'
#' Creates a list which contains the data series of a scatterplot, to be used for
#' serialization with the Highcharts JavaScript plotting
#' framework. Internal use only.
#'
#' @param x The x coordinates (should be a named vector!).
#' @param y The y coordinates.
#' @param a Alternative names for each point.
#' @return A list that is later serialized to JSON.
#' @author Panagiotis Moulos
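#' @examples
#' \dontrun{
#' # Minimal sketch with hypothetical gene names: each point becomes a list
#' # with x, y and name members, ready for JSON serialization
#' x <- c(gene1=1.234,gene2=-0.567)
#' y <- c(3.141,0.123)
#' hp <- make.highcharts.points(x,y)
#'}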
make.highcharts.points <- function(x,y,a=NULL) {
if (length(x)>0) {
n <- names(x)
x <- unname(x)
y <- unname(y)
stru <- vector("list",length(x))
if (is.null(a)) {
for (i in 1:length(x))
stru[[i]] <- list(
x=round(x[i],digits=3),
y=round(y[i],digits=3),
name=n[i]
)
}
else {
for (i in 1:length(x))
stru[[i]] <- list(
x=round(x[i],digits=3),
y=round(y[i],digits=3),
name=n[i],
alt_name=a[i]
)
}
}
else
stru <- list(x=NULL,y=NULL,name=NULL,alt_name=NULL)
return(stru)
}
#' Create a class vector
#'
#' Creates a class (condition) vector from a sample list. Used mostly internally
#' by the \code{stat.*} functions.
#'
#' @param sample.list the list containing condition names and the samples under
#' each condition.
#' @return A character vector of condition names, named after the corresponding
#' samples.
#' @author Panagiotis Moulos
#' @export
#' @examples
#' \dontrun{
#' sample.list <- list(A=c("A1","A2"),B=c("B1","B2","B3"))
#' clv <- as.class.vector(sample.list)
#'}
as.class.vector <- function(sample.list) {
classes <- vector("list",length(sample.list))
names(classes) <- names(sample.list)
for (n in names(sample.list))
classes[[n]] <- rep(n,times=length(sample.list[[n]]))
classes <- unlist(classes,use.names=FALSE)
names(classes) <- unlist(sample.list,use.names=FALSE)
return(classes)
}
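# Illustration only: for the sample.list of the example above,
# as.class.vector(list(A=c("A1","A2"),B=c("B1","B2","B3"))) is expected to
# return the named vector c(A1="A",A2="A",B1="B",B2="B",B3="B").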
#' Argument getter
#'
#' Get argument(s) from a list of arguments, e.g. normalization arguments.
#'
#' @param arg.list the initial list of a method's (e.g. normalization) arguments.
#' Can be created with the \code{\link{get.defaults}}
#' function.
#' @param arg.name the argument name inside the argument list to fetch its value.
#' @return The argument sub-list.
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' norm.list <- get.defaults("normalization","edger")
#' a <- get.arg(norm.list,c("main.method","logratioTrim"))
#'}
get.arg <- function(arg.list,arg.name) {
return(arg.list[arg.name])
}
#' Argument setter
#'
#' Set argument(s) to a list of arguments, e.g. normalization arguments.
#'
#' @param arg.list the initial list of a method's (e.g. normalization) arguments.
#' Can be created with the \code{\link{get.defaults}}
#' function.
#' @param arg.name either a named list whose names are the arguments to be set and
#' whose members are the values to be set, or a character vector of argument names.
#' In the latter case, \code{arg.value} must be supplied.
#' @param arg.value when \code{arg.name} is a vector of argument names, the values
#' corresponding to these arguments.
#' @return the \code{arg.list} with the changed \code{arg.value} for \code{arg.name}.
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' norm.list <- get.defaults("normalization","edger")
#' set.arg(norm.list,list(main.method="glm",logratioTrim=0.4))
#'}
set.arg <- function(arg.list,arg.name,arg.value=NULL) {
if (is.list(arg.name))
arg.list[names(arg.name)] <- arg.name
else if (is.character(arg.name)) {
tmp <- vector("list",length(arg.name))
names(tmp) <- arg.name
i <- 0
for (n in arg.name) {
i <- i + 1
tmp[[n]] <- arg.value[i]
}
arg.list[arg.name] <- tmp
}
return(arg.list)
}
#' Multiple testing correction helper
#'
#' A wrapper around the \code{\link{p.adjust}} function that also includes the
#' q-value adjustment procedure from the qvalue package. Internal use.
#'
#' @param p a vector of p-values.
#' @param m the adjustment method. See the help of \code{\link{p.adjust}}.
#' @author Panagiotis Moulos
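#' @return A vector of adjusted p-values (q-values when \code{m} is \code{"qvalue"}).
#' @examples
#' \dontrun{
#' # Minimal sketch on random p-values
#' p <- runif(100)
#' adj.p <- wp.adjust(p,"BH")
#'}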
wp.adjust <- function(p,m) {
if (m=="qvalue")
return(qvalue(p)$qvalues)
else
return(p.adjust(p,method=m))
}
#' List apply helper
#'
#' A wrapper around serial and parallel apply (\code{\link{lapply}} or
#' \code{\link{mclapply}} from the parallel package) to avoid excessive coding for
#' controlling single or parallel code execution. Internal use.
#'
#' @param m a logical indicating whether to execute in parallel or not.
#' @param ... the rest of the arguments passed to \code{\link{lapply}} (or
#' \code{mclapply}).
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' multic <- check.parallel(0.8)
#' # Test meaningful only in machines where parallel computation supported
#' if (multic) {
#' system.time(r<-wapply(TRUE,1:10,function(x) runif(1e+6)))
#' system.time(r<-wapply(FALSE,1:10,function(x) runif(1e+6)))
#' }
#'}
wapply <- function(m,...) {
if (m)
return(mclapply(...,mc.cores=getOption("cores"),mc.set.seed=FALSE))
else
return(lapply(...))
}
#' Filtering helper
#'
#' Low score filtering function. Internal use.
#'
#' @param x a numeric vector or matrix of count data.
#' @param f a threshold.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data("mm9.gene.data",package="metaseqR")
#' counts <- as.matrix(mm9.gene.counts[,9:12])
#' f <- filter.low(counts,median(counts))
#'}
filter.low <- function(x,f) { return(all(x<=f)) }
#' Filtering helper
#'
#' High score filtering function. Internal use.
#'
#' @param x a numeric vector or matrix of count data.
#' @param f a threshold.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' data("mm9.gene.data",package="metaseqR")
#' counts <- as.matrix(mm9.gene.counts[,9:12])
#' f <- filter.high(counts,median(counts))
#'}
filter.high <- function(x,f) { return(all(x>=f)) }
#' Message displayer
#'
#' Displays a message during execution of the several functions. Internal use.
#'
#' @param ... a vector of elements that compose the display message.
#' @export
#' @author Panagiotis Moulos
#' @examples
#' \dontrun{
#' i <- 1
#' disp("Now running iteration ",i,"...")
#'}
disp <- function(...) {
verbose <- get("VERBOSE",envir=meta.env)
if (!is.null(verbose) && verbose) {
message("\n",...,appendLF=FALSE)
}
logger <- get("LOGGER",envir=meta.env)
levalias <- c("one","two","three","four","five")
if (!is.null(logger)) {
switch(levalias[level(logger)],
one = { debug(logger,paste0(...)) },
two = { info(logger,gsub("\\n","",paste0(...))) },
three = { warn(logger,gsub("\\n","",paste0(...))) },
four = { error(logger,gsub("\\n","",paste0(...))) },
five = { fatal(logger,gsub("\\n","",paste0(...))) }
)
}
}
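# Logging-aware replacement for stop(): if a logger is registered in meta.env,
# the message is also recorded as a fatal (default) or error entry before the
# R error is raised.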
stopwrap <- function(...,t="fatal") {
logger <- get("LOGGER",envir=meta.env)
if (!is.null(logger)) {
if (t=="fatal")
fatal(logger,gsub("\\n","",paste0(...)))
else
error(logger,gsub("\\n","",paste0(...)))
}
stop(paste0(...))
}
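# Logging-aware replacement for warning(): the message is recorded by the logger
# (if any) and an R warning is issued, immediately when now=TRUE.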
warnwrap <- function(...,now=FALSE) {
logger <- get("LOGGER",envir=meta.env)
if (!is.null(logger))
warn(logger,gsub("\\n","",paste0(...)))
if (now)
warning(paste0(...),call.=FALSE,immediate.=TRUE)
else
warning(paste0(...),call.=FALSE)
}
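# Converts the time elapsed since start.time to a human-readable string
# (seconds, minutes, hours or days, as appropriate).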
elap2human <- function(start.time) {
start.time <- as.POSIXct(start.time)
dt <- difftime(Sys.time(),start.time,units="secs")
ndt <- as.numeric(dt)
if (ndt<60)
format(.POSIXct(dt,tz="GMT"),"%S seconds")
else if (ndt>=60 && ndt<3600)
format(.POSIXct(dt,tz="GMT"),"%M minutes %S seconds")
else if (ndt>=3600 && ndt<86400)
format(.POSIXct(dt,tz="GMT"),"%H hours %M minutes %S seconds")
else if (ndt>=86400)
format(.POSIXct(dt,tz="GMT"),"%d days %H hours %M minutes %S seconds")
}
deprecated.warning <- function(func) {
switch(func,
read.targets = {
warnwrap("\"yes\" and \"no\" for read strandedness have been ",
"deprecated. Please use \"forward\", \"forward\" or \"no\". ",
"Replacing \"yes\" with \"forward\"...")
}
)
}
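# Compares the installed metaseqR version against a fixed anchor version; the
# last "." of the package version string is turned into "-" to match the anchor
# format, and compareVersion() returns -1, 0 or 1 accordingly.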
metaseqR.version <- function() {
anchor.version <- "1.5-2"
current.version <- as.character(numeric_version(packageVersion("metaseqR")))
current.version <- gsub("(.*)\\.", "\\1-", current.version)
cmp <- compareVersion(current.version,anchor.version)
return(list(current=current.version,compare=cmp))
}
##' Fixed annotation updater
##'
##' A function to update the fixed annotations contained to avoid downloading every
##' time if it's not embedded. It has no parameters.
##'
##' @return This function does not return anything. It updates the fixed annotation
##' files instead.
##' @note This function cannot be used by users when the package is installed. For
##' this reason it is not exported. If you want to maintain a local copy of the
##' package and update annotation at will, you can download the package source.
##' @author Panagiotis Moulos
##' @examples
##' \dontrun{
##' library(metaseqR)
##' annotations.update()
##'}
#annotations.update <- function() {
# if(!require(biomaRt))
# stopwrap("Bioconductor package biomaRt is required to update annotations!")
# VERBOSE <<- TRUE
# supported.types <- c("gene","exon")
# supported.orgs <- c("hg18","hg19","mm9","mm10","rn5","dm3","danrer7")
# if (exists("ANNOTATION")) {
# for (type in supported.types) {
# for (org in supported.orgs) {
# disp("Downloading and writing ",type,"s for ",org,"...")
# tryCatch({
# tmp <- get.annotation(org,type)
# var.name <- paste(org,type,sep=".")
# assign(var.name,tmp)
# #if (!file.exists(ANNOTATION$ENSEMBL[[toupper(type)]]))
# # dir.create(ANNOTATION$ENSEMBL[[toupper(type)]],recursive=TRUE)
# #gzfh <- gzfile(file.path(ANNOTATION$ENSEMBL[[toupper(type)]],
# # paste(org,".txt.gz",sep="")),"w")
# #write.table(tmp,gzfh,sep="\t",row.names=FALSE,quote=FALSE)
# #close(gzfh)},
# save(list=eval(parse(text="var.name")),file=file.path(ANNOTATION,
# paste(org,type,"rda",sep=".")),compress=TRUE)},
# error=function(e) {
# disp("!!! Probable problem with connection to Biomart...")
# },
# finally=""
# )
# }
# }
# disp("Finished!\n")
# }
# else
# stopwrap("metaseqr environmental variables are not properly set up! ",
# "Annotations cannot be updated...")
#}
##' Fixed annotation reader
##'
##' A function to read fixed annotations from the local repository.
##'
##' @param org one of the supported organisms.
##' @param type \code{"gene"} or \code{"exon"}.
##' @return A data frame with the \code{type} annotation for \code{org}.
##' @author Panagiotis Moulos
##' @export
##' @examples
##' \dontrun{
##' ann <- read.annotation("hg19","gene")
##'}
#read.annotation <- function(org,type) {
# data(list=paste(org,type,sep="."))
# ann <- eval(parse(text=paste(org,type,sep=".")))
# if (type=="gene")
# rownames(ann) <- ann$gene_id
# else if (type=="exon")
# rownames(ann) <- ann$exon_id
# return(ann)
#}