R/family_funks.R

Defines functions removeExpressionVariant closestHomolog loadDnaFasta mclDataFrameAsList readMclOutput testAnnotationBasedSpeciesSpecificNeofunctionalization computeOrthologSpecificAnnotationDiversityPerGroup computeSpeciesSpecificAnnotationDiversityPerGroup geneGroupsAnnotationBasedShannonEntropy annotationBasedShannonEntropy compositeAnnotations extractFamilyName familyExpanded parseCafeBranchSpecificPValues getSpeciesNodeNumberFromCafe speciesCompositionsToDataFrame speciesComposition testShannonEntropy shannonEntropy

Documented in annotationBasedShannonEntropy closestHomolog compositeAnnotations computeOrthologSpecificAnnotationDiversityPerGroup computeSpeciesSpecificAnnotationDiversityPerGroup extractFamilyName familyExpanded geneGroupsAnnotationBasedShannonEntropy getSpeciesNodeNumberFromCafe loadDnaFasta mclDataFrameAsList parseCafeBranchSpecificPValues readMclOutput removeExpressionVariant shannonEntropy speciesComposition speciesCompositionsToDataFrame testAnnotationBasedSpeciesSpecificNeofunctionalization testShannonEntropy

#' Computes the normalized empirical Shannon Entropy for counts delivered in
#' the argument \code{counts.table}. Basis of the \code{log} function is
#' natural and normalization is done by division by the maximum entropy
#' \code{log(length(counts.table))} (see
#' \href{https://en.wikipedia.org/wiki/Entropy_(information_theory)#Efficiency)}{Normalized
#' Entropy}.
#'
#' @param counts.table result of invoking \code{base::table(count.vector)}
#' @param log.base The base of the logarithm. Default is the natural logarithm,
#' \code{getOption('GeneFamilies.entropy.log.base', base::exp(1))}.
#'
#' @export
#' @return A numeric value element [0,1], the normalized Shannon Entropy.
shannonEntropy <- function(counts.table, log.base = getOption("GeneFamilies.entropy.log.base", 
    base::exp(1))) {
    if (length(counts.table) <= 1) 
        return(0)
    c.t.s <- sum(counts.table)
    -sum(sapply(counts.table, function(x) x/c.t.s * log(x/c.t.s, base = log.base)))/log(length(counts.table), 
        base = log.base)
}

#' Testing function \code{GeneFamilies::shannonEntropy}
#'
#' @export
#' @return \code{TRUE} if and only if all tests are passed successfully.
testShannonEntropy <- function() {
    test.1 <- identical(shannonEntropy(table(c())), 0)
    test.2 <- identical(shannonEntropy(table(NULL)), 0)
    test.3 <- identical(shannonEntropy(table(NA)), 0)
    test.4 <- identical(shannonEntropy(table(1)), 0)
    test.5 <- identical(shannonEntropy(table(1:2)), 1)
    all(c(test.1, test.2, test.3, test.4, test.5))
}

#' Infers the number of genes belonging to the given species.
#'
#' @param 'fam' a character vector of gene identifiers representing the
#' argument gene family.
#' @param 'spec.cds.ids' a named list of character vectors indicating which
#' gene identifiers belong to which species. Default is set to constant
#' 'spec.gene.ids' declared in R/zzz.R
#'
#' @return An integer vector with names equeal to the names of argument list
#' 'spec.cds.ids' and values indicating how many gene members were found in the
#' respective species.
#' @export
speciesComposition <- function(fam, spec.cds.ids = spec.gene.ids) {
    spec.cnts <- setNames(as.integer(rep(0, (length(spec.cds.ids) + 1))), 
        c(names(spec.cds.ids), "fam.size"))
    for (spec in names(spec.cds.ids)) {
        x <- length(intersect(fam, spec.cds.ids[[spec]]))
        spec.cnts[[spec]] <- x
        if (sum(spec.cnts) == length(fam)) 
            break
    }
    spec.cnts[["fam.size"]] <- length(fam)
    spec.cnts
}

#' Generate a data frame of the list of species composition integer vectors.
#' Rows are named as families and columns as species. 
#'
#' @param spec.comps.lst - A list of integer vectors generated by lapply
#' function 'speciesComposition' on the gene families character vectors.
#' @param column.names - A character vector of column names. Default is the
#' names slot of the first entry of 'spec.comps.lst'
#' @param family.names - A character vector of family names. Default is the
#' names slot of argument list 'spec.comps.lst'
#'
#' @return A data frame: One row per family and one column per species with an
#' extra column of the family sizes.
#' @export
speciesCompositionsToDataFrame <- function(spec.comps.lst, column.names = names(spec.comps.lst[[1]]), 
    family.names = names(spec.comps.lst)) {
    as.data.frame(matrix(unlist(spec.comps.lst), byrow = TRUE, ncol = length(column.names), 
        nrow = length(spec.comps.lst), dimnames = list(family.names, column.names)), 
        stringsAsFactors = FALSE)
}

#' Extracts the number CAFE assigned the species during its analysis.
#'
#' @param cafe.id.line the string representing the line number three in the
#' cafe result file
#' @param species.name the species name as given in 'cafe.id.line' to extract
#' the CAFE internal number for
#'
#' @return The CAFE internal node number representing the species' branch in
#' the phylogeny. Returned as string.
#' @export
getSpeciesNodeNumberFromCafe <- function(cafe.id.line, species.name = "chi") {
    regmatches(cafe.id.line, regexec(paste(species.name, "<(\\d+)>", sep = ""), 
        cafe.id.line))[[1]][[2]]
}

#' Extract the branch specific extension / contraction P-Values that were
#' assigned to those branches that lead to the species' tips in the phylogeny.
#'
#' @param cafe.br.spec.pvals the string parsed out from the CAFE result table
#' @param spec.pos named integer vector holding the positions of the species
#' specific p-values in 'cafe.br.spec.pvals'. See ./exec/parseCafeResult.R for
#' more details
#'
#' @export
#' @return A data frame with columns the species and cells their specific
#' P-Values for expansion or contraction in the respective gene family.
parseCafeBranchSpecificPValues <- function(cafe.br.spec.pvals, spec.pos) {
    cafe.res <- strsplit(gsub("[()\\s]", "", cafe.br.spec.pvals, perl = TRUE), 
        ",")[[1]]
    #' Do not warn about the intent to make a numeric NA from the CAFE '-'
    #' character:
    suppressWarnings({
        data.frame(lapply(spec.pos, function(x) cafe.res[[x]]), stringsAsFactors = FALSE)
    })
}

#' Alternative method to detect expanded or compressed families comparing one
#' species with the background of the others. P-Values are obtained from an
#' outlier chisquare test; see package outliers::chisq.out.test(...) for
#' details.
#'
#' @param x named integer vector representing the gene family to test. x holds
#' the number of gene members per species. 
#' @param test.species a string being used ase a name in argument 'x'
#' indicating which species is the alternative hypothesis of expansion (or
#' contraction) based upon
#' @param background.species a character vector holding the names of the
#' species the background distribution of numbers of the species specific
#' family members is based upon.
#' @param expansion.test boolean indicating whether to test for expansion
#' (default TRUE) or compression (set to FALSE).
#'
#' @return numeric p-value obtained from outliers::chisq.out.test(x) or NA, if
#' expansion-test does not indicate a candidate outlier.
#' @import outliers
#' @export
familyExpanded <- function(x, test.species = "chi", background.species = setdiff(names(x), 
    test.species), expansion.test = max) {
    if (expansion.test && x[test.species] > max(x[background.species]) || 
        !expansion.test && x[test.species] < min(x[background.species])) {
        chisq.out.test(x)$p.value
    } else NA
}

#' Given a character vector regular expressions are applied to extract
#' substrings of the type 'fam1234'.
#'
#' @param char.vec A vector of strings each containing a family name, e.g. file
#' paths like 'families/fam1234/fam1234_ml_tree.newick'
#'
#' @return a character vector of extracted family names
#' @export
extractFamilyName <- function(char.vec) {
    regmatches(char.vec, regexpr("(fam\\d+)", char.vec, perl = TRUE))
}

#' Generates a list of composite Annotations found for the argument genes based
#' on the argument gene (function) annotation data.frame. Composite annotations
#' are sorted alphabetically and then concatonated with ',' as separator.
#'
#' @param gene.accs The identifiers or accessions of the genes to compute the
#' entropy for
#' @param gene.annos The data.frame holding the annotations for the genes in
#' 'gene.accs'. Default is all available InterPro annotations expected to be
#' found in 'all.ipr'
#' @param gene.col the column of 'gene.annos' in which to lookup the gene
#' identifiers or gene accessions. Default is 1
#' @param anno.col the column of 'gene.annos' in which to lookup the function
#' annotation for the genes in 'gene.accs'. Default is 2
#'
#' @export
#' @return An character vector with names being the gene identifiers
#' 'gene.accs' and values the composite (function) annotations.
compositeAnnotations <- function(gene.accs, gene.annos = all.ipr, gene.col = 1, 
    anno.col = 2) {
    setNames(unlist(lapply(gene.accs, function(x) {
        if (x %in% gene.annos[, gene.col]) {
            paste(sort(gene.annos[which(gene.annos[, gene.col] == x), anno.col]), 
                collapse = ",")
        } else NA
    })), gene.accs)
}

#' Compute Shannon Entropy (\code{\link[entropy]{entropy}}) as a measure of
#' annotation (function) diversity within a given set of genes. If gene(s) have
#' multiple annotations these are sorted and concatonated in order to treat
#' them as single composite annotations. This means, that e.g if all genes have
#' the same three annotations the resulting entropy will be 0.
#'
#' @param gene.accs The identifiers or accessions of the genes to compute the
#' entropy for
#' @param gene.annos The data.frame holding the annotations for the genes in
#' 'gene.accs'. Default is all available InterPro annotations expected to be
#' found in 'all.ipr'
#' @param gene.col the column of 'gene.annos' in which to lookup the gene
#' identifiers or gene accessions. Default is 1
#' @param anno.col the column of 'gene.annos' in which to lookup the function
#' annotation for the genes in 'gene.accs'. Default is 2
#' @param entropy.funk The function to be used to compute the Shannon Entropy.
#' Default is \code{getOption( 'GeneFamilies.entropy.function',
#' entropy::entropy )}. Set to \code{GeneFamilies::shannonEntropy} if you want
#' normalized Shannon Entropy. See
#' \href{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4696313/}{Entropy
#' Explorer R package}.
#'
#' @export
#' @return A list with two entries: 'entropy' is the computed shannon entropy,
#' and 'gene.annos.freqs' is an instance of table showing the annotation
#' architectures and their frequencies
annotationBasedShannonEntropy <- function(gene.accs, gene.annos = all.ipr, 
    gene.col = 1, anno.col = 2, entropy.funk = getOption("GeneFamilies.entropy.function", 
        entropy::entropy)) {
    c.a <- compositeAnnotations(gene.accs, gene.annos, gene.col, anno.col)
    c.a.i <- !is.na(c.a)
    if (any(c.a.i)) {
        gene.annos.freqs <- table(c.a[c.a.i])
        list(gene.annos.freqs = gene.annos.freqs, entropy = entropy.funk(gene.annos.freqs))
    } else list(gene.annos.freqs = NA, entropy = NA)
}

#' Computes for each gene group in \code{groups.ids} the annotation based
#' Shannon Entropy in parallel. Set \code{options(mc.cores=detectCores())} for
#' maximum speed.
#'
#' @param groups.ids The IDs of the gene groups to compute the annotation based
#' Shannon Entropy for.
#' @param groups.2.genes.lst A list with names intersecting with argument
#' \code{groups.ids} and values character vectors holding gene-IDs found in
#' \code{gene.annos}.
#' @param gene.col the column of 'gene.annos' in which to lookup the gene
#' identifiers or gene accessions. Default is 1
#' @param anno.col the column of 'gene.annos' in which to lookup the function
#' annotation for the genes in 'gene.accs'. Default is 2
#'
#' @export
#' @return A list with names \code{groups.ids} and values the computed
#' annotation based Shannon Entropies.
geneGroupsAnnotationBasedShannonEntropy <- function(groups.ids, groups.2.genes.lst, 
    gene.annos = all.ipr, gene.col = 1, anno.col = 2) {
    setNames(mclapply(groups.ids, function(x) {
        annotationBasedShannonEntropy(groups.2.genes.lst[[x]], gene.annos, 
            gene.col, anno.col)$entropy
    }), groups.ids)
}

#' Computes the annotation (function) based Shannon Entropy per gene group
#' separate for each species. Note it is highly recommended to set
#' \code{options(mc.cores=detectCores())}
#'
#' @param groups.df An instance of base::data.frame with at least three
#' columns. One must hold the gene group identifiers, another the species
#' names, and another the gene identifiers (accessions) belonging to the
#' respective group and species.
#' @param group.col The column name of 'groups.df' in which to find the gene
#' group identifiers. Default is 'Tandem.Cluster'
#' @param species.col The column name of 'groups.df' in which to find the
#' species names. Must intersect with 'species' argument. Default is 'Species'.
#' @param gene.col The column name of 'groups.df' in which to find the
#' gene identifiers (accessions). Default is 'Gene'.
#' @param group.ids all group identifiers for which to compute the species
#' specific annotation based Shannon Entropy. Default is all distinct gene
#' groups found in 'groups.df'.
#' @param species a character vector of species names present in 'groups.df'
#' 'species.col'. Species not present in this argument will not be considered.
#' Default is all distinct species names found in 'groups.df'.
#' @param gene.annos The data.frame holding the annotations for the genes in
#' 'gene.accs'. Default is all available InterPro annotations expected to be
#' found in 'all.ipr'
#' @param annos.gene.col the column of 'gene.annos' in which to lookup the gene
#' identifiers or gene accessions. Default is 1
#' @param annos.anno.col the column of 'gene.annos' in which to lookup the
#' function annotation for the genes in 'gene.accs'. Default is 2
#'
#' @export
#' @return An instance of base::data.frame with several columns. The first
#' holds the gene group identifiers obtained from 'groups.df' and one numeric
#' column for each species found in 'species'. The later columns hold the
#' species specific annotation entropies for the respective gene groups; NA
#' values where no genes of that species were found to be members of the
#' respective gene group.
computeSpeciesSpecificAnnotationDiversityPerGroup <- function(groups.df, 
    group.col = "Tandem.Cluster", species.col = "Species", gene.col = "Gene", 
    group.ids = unique(groups.df[, group.col]), species = unique(groups.df[, 
        species.col]), gene.annos = all.ipr, annos.gene.col = 1, annos.anno.col = 2) {
    res.df <- as.data.frame(do.call("rbind", mclapply(group.ids, function(x) {
        clstr <- groups.df[which(groups.df[, group.col] == x), ]
        clstr.spec.entrp <- lapply(species, function(y) {
            if (y %in% clstr[, species.col]) {
                annotationBasedShannonEntropy(clstr[which(clstr[, species.col] == 
                  y), gene.col], gene.annos = gene.annos, gene.col = annos.gene.col, 
                  anno.col = annos.anno.col)$entropy
            } else NA
        })
        setNames(as.list(unlist(c(x, clstr.spec.entrp))), c(group.col, 
            species))
    })), stringsAsFactors = FALSE)
    for (i in species) {
        res.df[, i] <- as.numeric(res.df[, i])
    }
    res.df
}


#' Wrapper for function
#' \code{computeSpeciesSpecificAnnotationDiversityPerGroup}. Instead of species
#' affiliation the distinction here is wether genes are Orthologs or
#' Non-Orthologs.
#'
#' @param gene.groups.df An instance of base::data.frame with at least two
#' columns. One must hold the gene group identifiers and the other the gene
#' identifiers (accessions) belonging to the respective group.
#' @param group.col The column name of 'gene.groups.df' in which to find the
#' gene group identifiers. Default is 'Family'.
#' @param gene.col The column name of 'gene.groups.df' in which to find the
#' gene identifiers (accessions). Default is 'Gene'.
#'
#' @export
#' @return An instance of base::data.frame with several columns. The first
#' holds the gene group identifiers obtained from 'gene.groups.df' and one
#' numeric column for each \code{Orthologs} and \code{Non-Orthologs}. The later
#' columns hold the gene-class specific annotation entropies for the respective
#' gene groups; NA values where no genes of that species were found to be
#' members of the respective gene group.
computeOrthologSpecificAnnotationDiversityPerGroup <- function(gene.groups.df, 
    orthologs = orthologs.genes, group.col = "Family", gene.col = "Gene") {
    gene.groups.extended.df <- gene.groups.df
    gene.groups.extended.df$Species <- as.character(unlist(mclapply(gene.groups.extended.df[, 
        gene.col], function(x) if (x %in% orthologs) "Orthologs" else "Non-Orthologs")))
    computeSpeciesSpecificAnnotationDiversityPerGroup(gene.groups.extended.df, 
        group.col = group.col, gene.col = gene.col)
}


#' Statistically tests the NULL Hypotheses that the annotation (function) based
#' species specific Shannon Entropy found in the gene groups for
#' 'spec.2.compare' is not greater than that of the other species. The
#' population means are compared using a t.test and the whole distributions are
#' compared using a Kolmogoroff Smirnoff test. Correction for multiple
#' hypotheses testing is carried out using the 'BY' method (see base::p.adjust
#' for more details). 
#'
#' @param entropy.df The result of calling function
#' \code{computeSpeciesSpecificAnnotationDiversityPerGroup(...)}
#' @param spec.2.compare The species which to compare to all others.
#' @param other.specs The species to compare pairwisely with
#' \code{spec.2.compare}
#'
#' @export
#' @return An instance of base::data.frame with 4 columns. The first two
#' indicate which two species are compared in the respective row, the 3rd
#' column holds the P-Values obtained from the pairwise T-Test, and the 4th
#' column the P-Value obtained from the KS-Test. All P-Values are corrected for
#' multiple hypotheses testing.
testAnnotationBasedSpeciesSpecificNeofunctionalization <- function(entropy.df, 
    spec.2.compare, other.specs) {
    spec.vs.others = expand.grid(spec.2.compare, other.specs)
    spec.vs.others$t.test.greater <- p.adjust(as.numeric(unlist(apply(spec.vs.others, 
        1, function(x) {
            t.test(entropy.df[, x[[1]]], entropy.df[, x[[2]]], alternative = "greater")$p.value
        }))), method = "BY")
    spec.vs.others$KS.test.less <- p.adjust(as.numeric(unlist(apply(spec.vs.others, 
        1, function(x) {
            ks.test(entropy.df[, x[[1]]], entropy.df[, x[[2]]], alternative = "less")$p.value
        }))), method = "BY")
    spec.vs.others
}

#' Uses Linux's AWK and package data.table to parse the output of MCL into an
#' instance of base::data.frame.
#'
#' @param path.2.mcl.out The valid path to the output file of the \code{mcl}
#' program.
#' @param family.name.prefix The prefix used to name gene families (clusters).
#' Default is \code{'cluster_'} resulting in 'cluster_1', 'cluster_2', etc.
#'
#' @export
#' @return A data.frame with two columns: 'Family', and 'Gene' mapping the
#' families to their gene members.
readMclOutput <- function(path.2.mcl.out, family.name.prefix = "cluster_") {
    sys.cmd <- paste("awk '{split($0,a,\"\\t\"); for (i in a) { print \"", 
        family.name.prefix, "\" NR \"\\t\" a[i] }}' ", path.2.mcl.out, 
        sep = "")
    fread(sys.cmd, stringsAsFactors = FALSE, sep = "\t", colClasses = rep("character", 
        2), col.names = c("Family", "Gene"), na.strings = "", data.table = FALSE)
}

#' Converts the data.frame of gene families into a list of gene families.
#'
#' @param mcl.df A data frame with two columns 'Family' and 'Gene' mapping the
#' families to their gene members. See \code{readMclOutput} for more details.
#'
#' @export
#' @return A list with names the families in argument \code{mcl.df} and values
#' the respective gene members joined into sub-lists.
mclDataFrameAsList <- function(mcl.df) {
    fams <- unique(mcl.df$Family)
    setNames(mclapply(fams, function(x) as.list(unlist(mcl.df[which(mcl.df$Family == 
        x), "Gene"]))), fams)
}

#' Simple wrapper around seqinr::read.fasta for DNA data. Function deletes gene
#' descriptions following the first blank character after the gene accession.
#' For example the name '>AT1G12345 crazy horse gymnast gene' becomes
#' 'AT1G12345'.
#'
#' @param path.2.dna.fasta The valid file path to the FASTA file to read in.
#' @param sanitize.names If TRUE, the default, gene descriptions following the
#' first blank after the gene accessions are discarded.
#'
#' @export
#' @return An instance of base::list as returned by \code{seqinr::read.fasta}
loadDnaFasta <- function(path.2.dna.fasta, sanitize.names = TRUE) {
    dna.lst <- read.fasta(file = path.2.dna.fasta, seqtype = "DNA", as.string = TRUE, 
        strip.desc = TRUE)
    if (sanitize.names) 
        names(dna.lst) <- sub("\\s+.*$", "", names(dna.lst))
    dna.lst
}

#' Uses pairwise sequence similarity search results in order to identify which
#' gene is the closest homolog of the argument \code{gene.id}. Self-Matches are
#' excluded and as measure of sequence similarity the Bit-Score is used.
#'
#' @param gene.id The accession of the gene for which to find it's closest
#' homolog.
#' @param seq.sim.tbl An instance of \code{base::data.frame} holding the
#' tabular output of the pairwise sequence similarity searches. Could be
#' generated by using BLAT or Blast. Default is \code{if
#' (exists('all.vs.all.sim')) all.vs.all.sim}.
#' @param gene.col.a The column-name of \code{seq.sim.tbl} in which to find
#' argument \code{gene.id}. MUST be a string not a number. Default is
#' \code{'V1'}
#' @param gene.col.b The column-name of \code{seq.sim.tbl} in which to find
#' gene accessions ('hits') of significant similarity to the argument gene
#' \code{gene.id}. MUST be a string not a number. Default is \code{'V2'}
#' @param bit.score.col The column name of \code{seq.sim.tbl} in which to find
#' the numeric Bit-Scores of the pairwise local sequence alignments, generated
#' by Blast or BLAT e.g. MUST be a name not a number. Default is \code{'V12'}.
#'
#' @export
#' @return The gene identifier (accession) of that gene that has the highest
#' Bit-Score in the pairwise comparisons with the argument gene \code{gene.id}.
#' If no such gene can be found NA is returned.
closestHomolog <- function(gene.id, seq.sim.tbl = if (exists("all.vs.all.sim")) all.vs.all.sim, 
    gene.col.a = "V1", gene.col.b = "V2", bit.score.col = "V12") {
    i <- which(seq.sim.tbl[, gene.col.a] == gene.id)
    if (length(i) > 0) {
        x <- seq.sim.tbl[i, c(gene.col.a, gene.col.b, bit.score.col)]
        x <- x[which(x[, gene.col.a] != x[, gene.col.b]), ]
        if (nrow(x) > 0) 
            x[base::order(x[, bit.score.col], decreasing = TRUE), gene.col.b][[1]] else NA
    } else NA
}


#' Modifies the argument \code{gene.ids} by deleting expression variant
#' identifiers \emph{and} removing resulting duplicate gene IDs. See
#' \code{base::sub} for more details.
#'
#' @param gene.ids a character vector of gene identifiers.
#' @param reg.ex the regular expression used to identify the to be deleted
#' expression variant part of a gene ID. Default is
#' \code{getOption('GeneFamilies.expression.variant.regex', '\\.\\d+$')}.
#'
#' @export
#' @return The modified gene IDs with the expression variant parts deleted.
removeExpressionVariant <- function(gene.ids, reg.ex = getOption("GeneFamilies.expression.variant.regex", 
    "\\.\\d+$")) {
    unique(sub(reg.ex, "", gene.ids))
}
asishallab/GeneFamilies documentation built on May 22, 2023, 11:30 a.m.