#' Evaluates burden of \emph{de novo} variation against expectation
#'
#' Determines whether the test population carry more \emph{de novo} variants
#' than expected. Variants may be grouped by variant class (e.g. are there more
#' LOF variants than expected, across the whole dataset?), or by gene (are there
#' more variants of a given class in SCN2A?).
#'
#' Analyses can be restricted to a subset of genes, and/or a subset of variant
#' classes
#'
#' See vignette("denovolyzeR_intro") for more information.
#'
#' @param genes A vector of genes containing de novo variants.
#' @param classes A vector of classes of de novo variants. Standard supported
#' classes are "syn" (synonymous), "mis" (missense), "non" (nonsense),
#' "splice" (splice), "frameshift" (frameshift) and "lof" (loss of function =
#' non + splice + frameshift). Additional classes that are supported by the
#' code, but are not included in the built-in probability tables, are
#' "stoploss","startloss", "misD" (damaging missense). These labels may be
#' used for user-supplied probability tables. If "misD" is present, then "mis"
#' (in the input) implies non-damaging missense.
#' @param nsamples Number of individuals considered in de novo analysis.
#' @param groupBy Results can be tabulated by "gene", or by variant "class"
#' @param includeClasses Determines which variant classes are tabulated in
#' output. In addition to the input classes, summaries can be produced for
#' "prot" (protein-altering = mis + lof), "all", and "protD" (protein damaging
#' = misD + lof, only available if misD included in user-specified probability
#' table). If "misD" is present, then "mis" will return statistics for all
#' missense. Non-damaging missense are not analysed separately.
#' @param includeGenes Genes to include in analysis. "all" or a vector of gene
#' names.
#' @param geneId Gene identifier used. One of "hgncID", "hgncSymbol",
#' "enstID", "ensgID" or "geneName" (default, equals ensembl "external_gene_name")
#' @param signifP Number of significant figures used to round p-values in
#' output.
#' @param roundExpected Number of decimal places used to round expected burdens
#' in output.
#' @param probTable Probability table. A user-defined table of probabilities can
#' be provided here, to replace the probability table included in the package.
#' @param misD If the user-specified probability table contains probabilities
#' for a sub-category of missense variants (e.g. predicted to be damaging by
#' an in silico algorithm), this column should be called misD, or the
#' alternative name should be specified here.
#'
#'
#' @return Returns a data frame
#'
#' @export denovolyze denovolyzeByClass denovolyzeByGene
#'
#' @import dplyr
#' @importFrom stats ppois
#' @importFrom reshape2 melt
#' @importFrom reshape2 dcast
#'
#' @examples
#'
#' ### denovolyze
#'
#' denovolyze(genes=autismDeNovos$gene,
#' classes=autismDeNovos$class,
#' nsamples=1078)
#'
#' ### denovolyzeByClass
#'
#' denovolyzeByClass(genes=autismDeNovos$gene,
#' classes=autismDeNovos$class,
#' nsamples=1078)
#'
#' # this convenience function is identical to:
#'
#' denovolyze(genes=autismDeNovos$gene,
#' classes=autismDeNovos$class,
#' nsamples=1078,
#' groupBy="class",
#' includeClasses=c("syn","mis","lof","prot","all"),
#' includeGenes="all"
#' )
#'
#' ### denovolyzeByGene
#'
#' denovolyzeByGene(genes=autismDeNovos$gene,
#' classes=autismDeNovos$class,
#' nsamples=1078)
#'
#' # this is identical to:
#'
#' denovolyze(genes=autismDeNovos$gene,
#' classes=autismDeNovos$class,
#' nsamples=1078,
#' groupBy="gene",
#' includeClasses=c("lof","prot"),
#' includeGenes="all"
#' )
#'
##### ----------------------------------------------------
denovolyze <- function(genes,classes,nsamples,
groupBy="class",
includeGenes="all",
includeClasses=c("syn","mis","misD",
"non","stoploss","startgain",
"splice","frameshift","lof","prot",
"protD", "all"),
geneId="geneName",
signifP=3,
roundExpected=1,
probTable=NULL,
misD=NULL) {
# this line defines variables in order to pass R CMD check
# these are column names used in dplyr::select(x) statement, but R CMD CHECK interprets them as global variables without visible binding
gene <- value <- enrichment <- Row.names <- ends_with <- NULL
# check inputs --------------------------
parseInput(genes,
classes,
nsamples,
groupBy,
includeGenes,
includeClasses,
geneId,
signifP,
roundExpected,
probTable)
# With an external probTable table, label the damaging missense column as misD
# --------------------------
if(!is.null(misD)){names(probTable)[names(probTable)==misD] <- "misD"}
# # Include all genes if indicated --------------------------
# # If byClass - include all genes. if byGene - include only genes containing at least one variant
if(toupper(includeGenes[1])=="ALL" & length(includeGenes==1)){
if(groupBy=="class"){
includeGenes <- toupper(probTable$gene)
} else if (groupBy=="gene"){
includeGenes <- unique(genes)
}
}
# generate meta-classes: "lof", "prot", "protD" and "all". if "misD" is present, "mis" in input = not-damaging mis. In output mis will refer to all missense.
# --------------------------
input <- data.frame(gene=genes,class=classes,stringsAsFactors=F)
input$class.1[input$class %in% c("splice","frameshift","non","stoploss","startloss")] <- "lof"
#temporarily relabel non-damaging missense as mis_notFilter.
input$class[input$class == "mis"] <- "mis_notFilter"
input$class.2[input$class %in% c("mis_notFilter","misD")] <- "mis"
input$class.3[input$class %in% c("splice","frameshift","non","stoploss","startloss",
"lof",
"mis_notFilter","misD")] <- "prot"
input$class.4[input$class %in% c("splice","frameshift","non","stoploss","startloss",
"lof","misD")] <- "protD"
input$class.5 <- "all"
input <- reshape2::melt(input,id.vars="gene") %>%
dplyr::select(gene, class = value) %>%
filter(!is.na(class)) %>%
filter(class!="mis_notFilter")
# tabulate observed & expected numbers, either by gene or by class
# --------------------------
if(groupBy=="class"){
observed <-
input %>%
filter(gene %in% includeGenes, class %in% includeClasses) %>%
group_by(class) %>%
summarise(
observed = n()
)
expected <- probTable %>%
filter(gene %in% includeGenes, class %in% includeClasses) %>%
group_by(class) %>%
summarise(
expected = 2*sum(value, na.rm=T)*nsamples
)
expected$class <- as.character(expected$class)
output <- left_join(observed,expected,by=c("class"))
} else if(groupBy=="gene"){
obs <- input %>%
filter(gene %in% includeGenes, class %in% includeClasses) %>%
group_by(gene,class) %>%
summarise(
observed = n()
)
exp <- probTable %>%
filter(gene %in% includeGenes, class %in% includeClasses) %>%
group_by(gene,class) %>%
summarise(
expected = 2*sum(value, na.rm=T)*nsamples
)
exp$class <- as.character(exp$class)
output <- merge(obs,exp,by=c("gene","class"),all=T)
}
# calculate poisson stats and enrichments ____________________
output[is.na(output)] <- 0
output$enrichment <- signif(output$observed/output$expected,signifP)
output$pValue <- signif(ppois(output$observed-1,lambda=output$expected,lower.tail=F),signifP)
if("exp" %in% names(output)){
output$exp <- round(output$exp,roundExpected)
} else if ("expected" %in% names(output)){
output$expected <- round(output$expected,roundExpected)
}
#warn if includeClasses includes variant classes not found in the data
extraClasses <- includeClasses[!includeClasses %in% output$class]
if(length(extraClasses)>0){
extraClasses <- paste(extraClasses,collapse=", ")
warning(paste("The following variant classes specified in includeClasses are not represented in the data:",extraClasses))
}
# impose a standard order on the variant classes for uniform output
# --------------------------
# start with standard list of classes in preferred order
standardClassLevels <- c("syn","misD","mis",
"non","stoploss","startloss",
"splice","frameshift","lof","prot","protD","all")
# append any classes in probTable that are not already in this list
inUseClasses <- as.character(unique(probTable$class))
extraClassLevels <- inUseClasses[!inUseClasses %in% standardClassLevels]
myClassLevels <- c(standardClassLevels,
extraClassLevels)
# factorise classes & drop redundant levels
output$class <- factor(output$class, levels=myClassLevels) %>% droplevels
output <- output %>% arrange(class)
# when analysing by gene, apply additional formatting to arrange results for different variant classes side by side
# only applied if >1 gene, so that for a single gene format 1 row per class. Otherwise 1 row per gene.)
# --------------------------
if(groupBy=="gene" & length(includeGenes)>1){
output <- output %>%
dplyr::select(-enrichment) %>%
reshape2::melt(id.vars=c("gene","class")) %>%
reshape2::dcast(formula = gene ~ class + variable)
#order by p-value
myIndex <- output %>% dplyr::select(ends_with("pValue")) %>% apply(MARGIN=1,min, na.rm=T) %>% order()
output <- output[myIndex,]
}
class(output) <- "data.frame"
return(output)
}
##### ----------------------------------------------------
#' @describeIn denovolyze denovolyzeByClass
denovolyzeByClass <- function(genes,classes,nsamples,
groupBy="class",
includeGenes="all",
includeClasses=c("syn","mis","lof","prot","all"),
geneId="geneName",
signifP=3,roundExpected=1,
probTable=NULL){
denovolyze(genes,classes,nsamples,groupBy,includeGenes,includeClasses,geneId,signifP,roundExpected,probTable)
}
##### ----------------------------------------------------
#' @describeIn denovolyze denovolyzeByGene
denovolyzeByGene <- function(genes,classes,nsamples,
groupBy="gene",
includeGenes="all",
includeClasses=c("lof","prot"),
geneId="geneName",
signifP=3,roundExpected=1,
probTable=NULL){
denovolyze(genes,classes,nsamples,groupBy,includeGenes,includeClasses,geneId,signifP,roundExpected,probTable)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.