#' Computes the normalized empirical Shannon Entropy for counts delivered in
#' the argument \code{counts.table}. Basis of the \code{log} function is
#' natural and normalization is done by division by the maximum entropy
#' \code{log(length(counts.table))} (see
#' \href{https://en.wikipedia.org/wiki/Entropy_(information_theory)#Efficiency)}{Normalized
#' Entropy}.
#'
#' @param counts.table result of invoking \code{base::table(count.vector)}
#' @param log.base The base of the logarithm. Default is the natural logarithm,
#' \code{getOption('GeneFamilies.entropy.log.base', base::exp(1))}.
#'
#' @export
#' @return A numeric value element [0,1], the normalized Shannon Entropy.
shannonEntropy <- function(counts.table, log.base = getOption("GeneFamilies.entropy.log.base",
base::exp(1))) {
if (length(counts.table) <= 1)
return(0)
c.t.s <- sum(counts.table)
-sum(sapply(counts.table, function(x) x/c.t.s * log(x/c.t.s, base = log.base)))/log(length(counts.table),
base = log.base)
}
#' Testing function \code{GeneFamilies::shannonEntropy}
#'
#' @export
#' @return \code{TRUE} if and only if all tests are passed successfully.
testShannonEntropy <- function() {
test.1 <- identical(shannonEntropy(table(c())), 0)
test.2 <- identical(shannonEntropy(table(NULL)), 0)
test.3 <- identical(shannonEntropy(table(NA)), 0)
test.4 <- identical(shannonEntropy(table(1)), 0)
test.5 <- identical(shannonEntropy(table(1:2)), 1)
all(c(test.1, test.2, test.3, test.4, test.5))
}
#' Infers the number of genes belonging to the given species.
#'
#' @param 'fam' a character vector of gene identifiers representing the
#' argument gene family.
#' @param 'spec.cds.ids' a named list of character vectors indicating which
#' gene identifiers belong to which species. Default is set to constant
#' 'spec.gene.ids' declared in R/zzz.R
#'
#' @return An integer vector with names equeal to the names of argument list
#' 'spec.cds.ids' and values indicating how many gene members were found in the
#' respective species.
#' @export
speciesComposition <- function(fam, spec.cds.ids = spec.gene.ids) {
spec.cnts <- setNames(as.integer(rep(0, (length(spec.cds.ids) + 1))),
c(names(spec.cds.ids), "fam.size"))
for (spec in names(spec.cds.ids)) {
x <- length(intersect(fam, spec.cds.ids[[spec]]))
spec.cnts[[spec]] <- x
if (sum(spec.cnts) == length(fam))
break
}
spec.cnts[["fam.size"]] <- length(fam)
spec.cnts
}
#' Generate a data frame of the list of species composition integer vectors.
#' Rows are named as families and columns as species.
#'
#' @param spec.comps.lst - A list of integer vectors generated by lapply
#' function 'speciesComposition' on the gene families character vectors.
#' @param column.names - A character vector of column names. Default is the
#' names slot of the first entry of 'spec.comps.lst'
#' @param family.names - A character vector of family names. Default is the
#' names slot of argument list 'spec.comps.lst'
#'
#' @return A data frame: One row per family and one column per species with an
#' extra column of the family sizes.
#' @export
speciesCompositionsToDataFrame <- function(spec.comps.lst, column.names = names(spec.comps.lst[[1]]),
family.names = names(spec.comps.lst)) {
as.data.frame(matrix(unlist(spec.comps.lst), byrow = TRUE, ncol = length(column.names),
nrow = length(spec.comps.lst), dimnames = list(family.names, column.names)),
stringsAsFactors = FALSE)
}
#' Extracts the number CAFE assigned the species during its analysis.
#'
#' @param cafe.id.line the string representing the line number three in the
#' cafe result file
#' @param species.name the species name as given in 'cafe.id.line' to extract
#' the CAFE internal number for
#'
#' @return The CAFE internal node number representing the species' branch in
#' the phylogeny. Returned as string.
#' @export
getSpeciesNodeNumberFromCafe <- function(cafe.id.line, species.name = "chi") {
regmatches(cafe.id.line, regexec(paste(species.name, "<(\\d+)>", sep = ""),
cafe.id.line))[[1]][[2]]
}
#' Extract the branch specific extension / contraction P-Values that were
#' assigned to those branches that lead to the species' tips in the phylogeny.
#'
#' @param cafe.br.spec.pvals the string parsed out from the CAFE result table
#' @param spec.pos named integer vector holding the positions of the species
#' specific p-values in 'cafe.br.spec.pvals'. See ./exec/parseCafeResult.R for
#' more details
#'
#' @export
#' @return A data frame with columns the species and cells their specific
#' P-Values for expansion or contraction in the respective gene family.
parseCafeBranchSpecificPValues <- function(cafe.br.spec.pvals, spec.pos) {
cafe.res <- strsplit(gsub("[()\\s]", "", cafe.br.spec.pvals, perl = TRUE),
",")[[1]]
#' Do not warn about the intent to make a numeric NA from the CAFE '-'
#' character:
suppressWarnings({
data.frame(lapply(spec.pos, function(x) cafe.res[[x]]), stringsAsFactors = FALSE)
})
}
#' Alternative method to detect expanded or compressed families comparing one
#' species with the background of the others. P-Values are obtained from an
#' outlier chisquare test; see package outliers::chisq.out.test(...) for
#' details.
#'
#' @param x named integer vector representing the gene family to test. x holds
#' the number of gene members per species.
#' @param test.species a string being used ase a name in argument 'x'
#' indicating which species is the alternative hypothesis of expansion (or
#' contraction) based upon
#' @param background.species a character vector holding the names of the
#' species the background distribution of numbers of the species specific
#' family members is based upon.
#' @param expansion.test boolean indicating whether to test for expansion
#' (default TRUE) or compression (set to FALSE).
#'
#' @return numeric p-value obtained from outliers::chisq.out.test(x) or NA, if
#' expansion-test does not indicate a candidate outlier.
#' @import outliers
#' @export
familyExpanded <- function(x, test.species = "chi", background.species = setdiff(names(x),
test.species), expansion.test = max) {
if (expansion.test && x[test.species] > max(x[background.species]) ||
!expansion.test && x[test.species] < min(x[background.species])) {
chisq.out.test(x)$p.value
} else NA
}
#' Given a character vector regular expressions are applied to extract
#' substrings of the type 'fam1234'.
#'
#' @param char.vec A vector of strings each containing a family name, e.g. file
#' paths like 'families/fam1234/fam1234_ml_tree.newick'
#'
#' @return a character vector of extracted family names
#' @export
extractFamilyName <- function(char.vec) {
regmatches(char.vec, regexpr("(fam\\d+)", char.vec, perl = TRUE))
}
#' Generates a list of composite Annotations found for the argument genes based
#' on the argument gene (function) annotation data.frame. Composite annotations
#' are sorted alphabetically and then concatonated with ',' as separator.
#'
#' @param gene.accs The identifiers or accessions of the genes to compute the
#' entropy for
#' @param gene.annos The data.frame holding the annotations for the genes in
#' 'gene.accs'. Default is all available InterPro annotations expected to be
#' found in 'all.ipr'
#' @param gene.col the column of 'gene.annos' in which to lookup the gene
#' identifiers or gene accessions. Default is 1
#' @param anno.col the column of 'gene.annos' in which to lookup the function
#' annotation for the genes in 'gene.accs'. Default is 2
#'
#' @export
#' @return An character vector with names being the gene identifiers
#' 'gene.accs' and values the composite (function) annotations.
compositeAnnotations <- function(gene.accs, gene.annos = all.ipr, gene.col = 1,
anno.col = 2) {
setNames(unlist(lapply(gene.accs, function(x) {
if (x %in% gene.annos[, gene.col]) {
paste(sort(gene.annos[which(gene.annos[, gene.col] == x), anno.col]),
collapse = ",")
} else NA
})), gene.accs)
}
#' Compute Shannon Entropy (\code{\link[entropy]{entropy}}) as a measure of
#' annotation (function) diversity within a given set of genes. If gene(s) have
#' multiple annotations these are sorted and concatonated in order to treat
#' them as single composite annotations. This means, that e.g if all genes have
#' the same three annotations the resulting entropy will be 0.
#'
#' @param gene.accs The identifiers or accessions of the genes to compute the
#' entropy for
#' @param gene.annos The data.frame holding the annotations for the genes in
#' 'gene.accs'. Default is all available InterPro annotations expected to be
#' found in 'all.ipr'
#' @param gene.col the column of 'gene.annos' in which to lookup the gene
#' identifiers or gene accessions. Default is 1
#' @param anno.col the column of 'gene.annos' in which to lookup the function
#' annotation for the genes in 'gene.accs'. Default is 2
#' @param entropy.funk The function to be used to compute the Shannon Entropy.
#' Default is \code{getOption( 'GeneFamilies.entropy.function',
#' entropy::entropy )}. Set to \code{GeneFamilies::shannonEntropy} if you want
#' normalized Shannon Entropy. See
#' \href{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4696313/}{Entropy
#' Explorer R package}.
#'
#' @export
#' @return A list with two entries: 'entropy' is the computed shannon entropy,
#' and 'gene.annos.freqs' is an instance of table showing the annotation
#' architectures and their frequencies
annotationBasedShannonEntropy <- function(gene.accs, gene.annos = all.ipr,
gene.col = 1, anno.col = 2, entropy.funk = getOption("GeneFamilies.entropy.function",
entropy::entropy)) {
c.a <- compositeAnnotations(gene.accs, gene.annos, gene.col, anno.col)
c.a.i <- !is.na(c.a)
if (any(c.a.i)) {
gene.annos.freqs <- table(c.a[c.a.i])
list(gene.annos.freqs = gene.annos.freqs, entropy = entropy.funk(gene.annos.freqs))
} else list(gene.annos.freqs = NA, entropy = NA)
}
#' Computes for each gene group in \code{groups.ids} the annotation based
#' Shannon Entropy in parallel. Set \code{options(mc.cores=detectCores())} for
#' maximum speed.
#'
#' @param groups.ids The IDs of the gene groups to compute the annotation based
#' Shannon Entropy for.
#' @param groups.2.genes.lst A list with names intersecting with argument
#' \code{groups.ids} and values character vectors holding gene-IDs found in
#' \code{gene.annos}.
#' @param gene.col the column of 'gene.annos' in which to lookup the gene
#' identifiers or gene accessions. Default is 1
#' @param anno.col the column of 'gene.annos' in which to lookup the function
#' annotation for the genes in 'gene.accs'. Default is 2
#'
#' @export
#' @return A list with names \code{groups.ids} and values the computed
#' annotation based Shannon Entropies.
geneGroupsAnnotationBasedShannonEntropy <- function(groups.ids, groups.2.genes.lst,
gene.annos = all.ipr, gene.col = 1, anno.col = 2) {
setNames(mclapply(groups.ids, function(x) {
annotationBasedShannonEntropy(groups.2.genes.lst[[x]], gene.annos,
gene.col, anno.col)$entropy
}), groups.ids)
}
#' Computes the annotation (function) based Shannon Entropy per gene group
#' separate for each species. Note it is highly recommended to set
#' \code{options(mc.cores=detectCores())}
#'
#' @param groups.df An instance of base::data.frame with at least three
#' columns. One must hold the gene group identifiers, another the species
#' names, and another the gene identifiers (accessions) belonging to the
#' respective group and species.
#' @param group.col The column name of 'groups.df' in which to find the gene
#' group identifiers. Default is 'Tandem.Cluster'
#' @param species.col The column name of 'groups.df' in which to find the
#' species names. Must intersect with 'species' argument. Default is 'Species'.
#' @param gene.col The column name of 'groups.df' in which to find the
#' gene identifiers (accessions). Default is 'Gene'.
#' @param group.ids all group identifiers for which to compute the species
#' specific annotation based Shannon Entropy. Default is all distinct gene
#' groups found in 'groups.df'.
#' @param species a character vector of species names present in 'groups.df'
#' 'species.col'. Species not present in this argument will not be considered.
#' Default is all distinct species names found in 'groups.df'.
#' @param gene.annos The data.frame holding the annotations for the genes in
#' 'gene.accs'. Default is all available InterPro annotations expected to be
#' found in 'all.ipr'
#' @param annos.gene.col the column of 'gene.annos' in which to lookup the gene
#' identifiers or gene accessions. Default is 1
#' @param annos.anno.col the column of 'gene.annos' in which to lookup the
#' function annotation for the genes in 'gene.accs'. Default is 2
#'
#' @export
#' @return An instance of base::data.frame with several columns. The first
#' holds the gene group identifiers obtained from 'groups.df' and one numeric
#' column for each species found in 'species'. The later columns hold the
#' species specific annotation entropies for the respective gene groups; NA
#' values where no genes of that species were found to be members of the
#' respective gene group.
computeSpeciesSpecificAnnotationDiversityPerGroup <- function(groups.df,
group.col = "Tandem.Cluster", species.col = "Species", gene.col = "Gene",
group.ids = unique(groups.df[, group.col]), species = unique(groups.df[,
species.col]), gene.annos = all.ipr, annos.gene.col = 1, annos.anno.col = 2) {
res.df <- as.data.frame(do.call("rbind", mclapply(group.ids, function(x) {
clstr <- groups.df[which(groups.df[, group.col] == x), ]
clstr.spec.entrp <- lapply(species, function(y) {
if (y %in% clstr[, species.col]) {
annotationBasedShannonEntropy(clstr[which(clstr[, species.col] ==
y), gene.col], gene.annos = gene.annos, gene.col = annos.gene.col,
anno.col = annos.anno.col)$entropy
} else NA
})
setNames(as.list(unlist(c(x, clstr.spec.entrp))), c(group.col,
species))
})), stringsAsFactors = FALSE)
for (i in species) {
res.df[, i] <- as.numeric(res.df[, i])
}
res.df
}
#' Wrapper for function
#' \code{computeSpeciesSpecificAnnotationDiversityPerGroup}. Instead of species
#' affiliation the distinction here is wether genes are Orthologs or
#' Non-Orthologs.
#'
#' @param gene.groups.df An instance of base::data.frame with at least two
#' columns. One must hold the gene group identifiers and the other the gene
#' identifiers (accessions) belonging to the respective group.
#' @param group.col The column name of 'gene.groups.df' in which to find the
#' gene group identifiers. Default is 'Family'.
#' @param gene.col The column name of 'gene.groups.df' in which to find the
#' gene identifiers (accessions). Default is 'Gene'.
#'
#' @export
#' @return An instance of base::data.frame with several columns. The first
#' holds the gene group identifiers obtained from 'gene.groups.df' and one
#' numeric column for each \code{Orthologs} and \code{Non-Orthologs}. The later
#' columns hold the gene-class specific annotation entropies for the respective
#' gene groups; NA values where no genes of that species were found to be
#' members of the respective gene group.
computeOrthologSpecificAnnotationDiversityPerGroup <- function(gene.groups.df,
orthologs = orthologs.genes, group.col = "Family", gene.col = "Gene") {
gene.groups.extended.df <- gene.groups.df
gene.groups.extended.df$Species <- as.character(unlist(mclapply(gene.groups.extended.df[,
gene.col], function(x) if (x %in% orthologs) "Orthologs" else "Non-Orthologs")))
computeSpeciesSpecificAnnotationDiversityPerGroup(gene.groups.extended.df,
group.col = group.col, gene.col = gene.col)
}
#' Statistically tests the NULL Hypotheses that the annotation (function) based
#' species specific Shannon Entropy found in the gene groups for
#' 'spec.2.compare' is not greater than that of the other species. The
#' population means are compared using a t.test and the whole distributions are
#' compared using a Kolmogoroff Smirnoff test. Correction for multiple
#' hypotheses testing is carried out using the 'BY' method (see base::p.adjust
#' for more details).
#'
#' @param entropy.df The result of calling function
#' \code{computeSpeciesSpecificAnnotationDiversityPerGroup(...)}
#' @param spec.2.compare The species which to compare to all others.
#' @param other.specs The species to compare pairwisely with
#' \code{spec.2.compare}
#'
#' @export
#' @return An instance of base::data.frame with 4 columns. The first two
#' indicate which two species are compared in the respective row, the 3rd
#' column holds the P-Values obtained from the pairwise T-Test, and the 4th
#' column the P-Value obtained from the KS-Test. All P-Values are corrected for
#' multiple hypotheses testing.
testAnnotationBasedSpeciesSpecificNeofunctionalization <- function(entropy.df,
spec.2.compare, other.specs) {
spec.vs.others = expand.grid(spec.2.compare, other.specs)
spec.vs.others$t.test.greater <- p.adjust(as.numeric(unlist(apply(spec.vs.others,
1, function(x) {
t.test(entropy.df[, x[[1]]], entropy.df[, x[[2]]], alternative = "greater")$p.value
}))), method = "BY")
spec.vs.others$KS.test.less <- p.adjust(as.numeric(unlist(apply(spec.vs.others,
1, function(x) {
ks.test(entropy.df[, x[[1]]], entropy.df[, x[[2]]], alternative = "less")$p.value
}))), method = "BY")
spec.vs.others
}
#' Uses Linux's AWK and package data.table to parse the output of MCL into an
#' instance of base::data.frame.
#'
#' @param path.2.mcl.out The valid path to the output file of the \code{mcl}
#' program.
#' @param family.name.prefix The prefix used to name gene families (clusters).
#' Default is \code{'cluster_'} resulting in 'cluster_1', 'cluster_2', etc.
#'
#' @export
#' @return A data.frame with two columns: 'Family', and 'Gene' mapping the
#' families to their gene members.
readMclOutput <- function(path.2.mcl.out, family.name.prefix = "cluster_") {
sys.cmd <- paste("awk '{split($0,a,\"\\t\"); for (i in a) { print \"",
family.name.prefix, "\" NR \"\\t\" a[i] }}' ", path.2.mcl.out,
sep = "")
fread(sys.cmd, stringsAsFactors = FALSE, sep = "\t", colClasses = rep("character",
2), col.names = c("Family", "Gene"), na.strings = "", data.table = FALSE)
}
#' Converts the data.frame of gene families into a list of gene families.
#'
#' @param mcl.df A data frame with two columns 'Family' and 'Gene' mapping the
#' families to their gene members. See \code{readMclOutput} for more details.
#'
#' @export
#' @return A list with names the families in argument \code{mcl.df} and values
#' the respective gene members joined into sub-lists.
mclDataFrameAsList <- function(mcl.df) {
fams <- unique(mcl.df$Family)
setNames(mclapply(fams, function(x) as.list(unlist(mcl.df[which(mcl.df$Family ==
x), "Gene"]))), fams)
}
#' Simple wrapper around seqinr::read.fasta for DNA data. Function deletes gene
#' descriptions following the first blank character after the gene accession.
#' For example the name '>AT1G12345 crazy horse gymnast gene' becomes
#' 'AT1G12345'.
#'
#' @param path.2.dna.fasta The valid file path to the FASTA file to read in.
#' @param sanitize.names If TRUE, the default, gene descriptions following the
#' first blank after the gene accessions are discarded.
#'
#' @export
#' @return An instance of base::list as returned by \code{seqinr::read.fasta}
loadDnaFasta <- function(path.2.dna.fasta, sanitize.names = TRUE) {
dna.lst <- read.fasta(file = path.2.dna.fasta, seqtype = "DNA", as.string = TRUE,
strip.desc = TRUE)
if (sanitize.names)
names(dna.lst) <- sub("\\s+.*$", "", names(dna.lst))
dna.lst
}
#' Uses pairwise sequence similarity search results in order to identify which
#' gene is the closest homolog of the argument \code{gene.id}. Self-Matches are
#' excluded and as measure of sequence similarity the Bit-Score is used.
#'
#' @param gene.id The accession of the gene for which to find it's closest
#' homolog.
#' @param seq.sim.tbl An instance of \code{base::data.frame} holding the
#' tabular output of the pairwise sequence similarity searches. Could be
#' generated by using BLAT or Blast. Default is \code{if
#' (exists('all.vs.all.sim')) all.vs.all.sim}.
#' @param gene.col.a The column-name of \code{seq.sim.tbl} in which to find
#' argument \code{gene.id}. MUST be a string not a number. Default is
#' \code{'V1'}
#' @param gene.col.b The column-name of \code{seq.sim.tbl} in which to find
#' gene accessions ('hits') of significant similarity to the argument gene
#' \code{gene.id}. MUST be a string not a number. Default is \code{'V2'}
#' @param bit.score.col The column name of \code{seq.sim.tbl} in which to find
#' the numeric Bit-Scores of the pairwise local sequence alignments, generated
#' by Blast or BLAT e.g. MUST be a name not a number. Default is \code{'V12'}.
#'
#' @export
#' @return The gene identifier (accession) of that gene that has the highest
#' Bit-Score in the pairwise comparisons with the argument gene \code{gene.id}.
#' If no such gene can be found NA is returned.
closestHomolog <- function(gene.id, seq.sim.tbl = if (exists("all.vs.all.sim")) all.vs.all.sim,
gene.col.a = "V1", gene.col.b = "V2", bit.score.col = "V12") {
i <- which(seq.sim.tbl[, gene.col.a] == gene.id)
if (length(i) > 0) {
x <- seq.sim.tbl[i, c(gene.col.a, gene.col.b, bit.score.col)]
x <- x[which(x[, gene.col.a] != x[, gene.col.b]), ]
if (nrow(x) > 0)
x[base::order(x[, bit.score.col], decreasing = TRUE), gene.col.b][[1]] else NA
} else NA
}
#' Modifies the argument \code{gene.ids} by deleting expression variant
#' identifiers \emph{and} removing resulting duplicate gene IDs. See
#' \code{base::sub} for more details.
#'
#' @param gene.ids a character vector of gene identifiers.
#' @param reg.ex the regular expression used to identify the to be deleted
#' expression variant part of a gene ID. Default is
#' \code{getOption('GeneFamilies.expression.variant.regex', '\\.\\d+$')}.
#'
#' @export
#' @return The modified gene IDs with the expression variant parts deleted.
removeExpressionVariant <- function(gene.ids, reg.ex = getOption("GeneFamilies.expression.variant.regex",
"\\.\\d+$")) {
unique(sub(reg.ex, "", gene.ids))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.