R/parsePhyloProfile.R
In PhyloProfile: PhyloProfile

Documented in calcPresSpec filterProfileData fromInputToProfile getInputTaxaID getInputTaxaName getNameList getSelectedTaxonNames getTaxonomyMatrix getTaxonomyRanks parseInfoProfile reduceProfile sortInputTaxa

# Functions for parsing and pre-processing PhyloProfile input

#' Create a list containing all main taxanomy ranks
#' @export
#' @return A list of all main ranks (from strain to superkingdom)
#' @author Carla Mölbert (carla.moelbert@gmx.de)
#' @examples
#' getTaxonomyRanks()

getTaxonomyRanks <- function(){
    allRanks <- list(
        "Strain " = "strain",
        "Species" = "species",
        "Genus" = "genus",
        "Family" = "family",
        "Order" = "order",
        "Class" = "class",
        "Phylum" = "phylum",
        "Kingdom" = "kingdom",
        "Superkingdom" = "superkingdom",
        "unselected" = ""
    )
    return(allRanks)
}

#' Get list of pre-installed NCBI taxon names
#' @description Get all NCBI taxon names from
#' "PhyloProfile/data/taxonNamesReduced.txt"
#' @export
#' @return List of taxon IDs, their full names, taxonomy ranks and parent IDs
#' obtained from "PhyloProfile/data/taxonNamesReduced.txt"
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @examples
#' getNameList()

getNameList <- function() {
    nameReducedFile <- paste(
        system.file(package="PhyloProfile"),
        "PhyloProfile/data/taxonNamesReduced.txt",
        sep="/"
    )
    
    if (!file.exists(nameReducedFile)) {
        utils::data(taxonNamesReduced)
    } else {
        taxonNamesReduced <- utils::read.table(
            nameReducedFile, sep = "\t", header = TRUE, fill = TRUE
        )
    }
    
    taxonNamesReduced$fullName <- as.character(taxonNamesReduced$fullName)
    taxonNamesReduced$rank <- as.character(taxonNamesReduced$rank)
    taxonNamesReduced <- taxonNamesReduced[!duplicated(taxonNamesReduced), ]
    
    return(taxonNamesReduced)
}

#' Get taxonomy matrix
#' @description Get the (full or subset) taxonomy matrix from
#' "data/taxonomyMatrix.txt" based on an input taxon list
#' @export
#' @param subsetTaxaCheck TRUE/FALSE subset taxonomy matrix based on input taxon
#' IDs. Default = FALSE.
#' @param taxonIDs list of input taxon IDs (e.g. ncbi1234). Default = NULL.
#' @return Data frame contains the (subset of) taxonomy matrix for list of
#' input taxa.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @examples
#' # get full pre-installed taxonomy matrix
#' getTaxonomyMatrix(FALSE, NULL)
#' # get taxonomy matrix for a list of taxon IDs
#' taxonIDs <- c("ncbi9606", "ncbi10116")
#' getTaxonomyMatrix(TRUE, taxonIDs)

getTaxonomyMatrix <- function(subsetTaxaCheck = FALSE, taxonIDs = NULL){
    taxonomyMatrixFile <- paste(
        system.file(package="PhyloProfile"),
        "PhyloProfile/data/taxonomyMatrix.txt",
        sep="/"
    )
    
    if (!file.exists(taxonomyMatrixFile)) {
        utils::data(taxonomyMatrix)
    } else {
        taxonomyMatrix <- utils::read.table(
            taxonomyMatrixFile, sep = "\t", header = TRUE,
            stringsAsFactors = TRUE
        )
    }
    
    if (subsetTaxaCheck) {
        if (missing(taxonIDs)) return(taxonomyMatrix)
        taxonomyMatrix <- taxonomyMatrix[
            taxonomyMatrix$abbrName  %in% taxonIDs, ]
    }
    return(taxonomyMatrix)
}

#' Get ID list of input taxa from the main input
#' @param rawProfile A dataframe of input phylogenetic profile in long format
#' @return List of all input taxon IDs (e.g. ncbi1234). Default = NULL.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{mainLongRaw}}
#' @examples
#' data("mainLongRaw", package="PhyloProfile")
#' getInputTaxaID(mainLongRaw)

getInputTaxaID <- function(rawProfile = NULL){
    if (is.null(rawProfile)) stop("Input profile data cannot be NULL!")
    inputTaxa <- levels(as.factor(rawProfile$ncbiID))
    inputTaxa <- unlist(strsplit(inputTaxa, split = "\t"))
    # remove "geneID" element from vector inputTaxa
    if (inputTaxa[1] == "geneID") inputTaxa <- inputTaxa[-1]
    # return input taxa
    return(inputTaxa)
}

#' Get NCBI taxon names for a selected list of taxa
#' @description Get NCBI taxon names from
#' "PhyloProfile/data/taxonNamesReduced.txt" for a list of input taxa
#' @param rankName taxonomy rank (e.g. "species","phylum",...)
#' @param taxonIDs list of taxon IDs (e.g. ncbi1234). Default = NULL
#' @return Data frame contains a list of full names, taxonomy ranks and parent
#' IDs for the input taxa.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @seealso \code{\link{getInputTaxaID}} for getting input taxon IDs,
#' \code{\link{getNameList}} for getting the full taxon name list
#' @examples
#' taxonIDs <- c("ncbi9606", "ncbi10116")
#' getInputTaxaName("species", taxonIDs)

getInputTaxaName <- function(rankName, taxonIDs = NULL){
    # check input parameters
    if (missing(rankName)) return("No taxonomy rank name given!")
    allMainRanks <- getTaxonomyRanks()
    if (!(rankName[1] %in% allMainRanks)) return("Invalid taxonomy rank given!")
    # load list of unsorted taxa
    Dt <- getTaxonomyMatrix(TRUE, taxonIDs)
    # load list of taxon name
    nameList <- getNameList()
    # return
    choice <- data.frame(
        "ncbiID" = unlist(Dt[rankName]), stringsAsFactors = FALSE
    )
    choice <- merge(choice, nameList, by = "ncbiID", all = FALSE)
    return(choice)
}

#' Get a subset of input taxa based on a selected taxonomy rank
#' @description Get a subset of taxon ncbi IDs and names from an input list of
#' taxa based on a selected supertaxon (identified by its taxonomy rank and
#' supertaxon name or supertaxon ID).
#' @usage getSelectedTaxonNames(inputTaxonIDs, rank, higherRank, higherID,
#'     higherName)
#' @param inputTaxonIDs list of input taxon IDs (e.g. c("10116", "122586"))
#' @param rank taxonomy rank of input taxa (e.g. "species")
#' @param higherRank selected taxonomy rank (e.g. "phylum")
#' @param higherID supertaxon ID (e.g. 7711). NOTE: either supertaxon ID or
#' name is required, not neccessary to give both.
#' @param higherName supertaxon name (e.g. "Chordata"). NOTE: either
#' supertaxon ID or name is required, not neccessary to give both.
#' @export
#' @return A data frame contains ncbi IDs and names of taxa from the input taxon
#' list that belong to the selected supertaxon.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @examples
#' inputTaxonIDs <- c("10116", "122586", "123851", "13616", "188937", "189518",
#' "208964", "224129", "224324", "237631", "243230")
#' rank <- "species"
#' higherRank <- "phylum"
#' higherID <- 7711
#' getSelectedTaxonNames(inputTaxonIDs, rank, higherRank, higherID, NULL)
#' higherName <- "Chordata"
#' getSelectedTaxonNames(inputTaxonIDs, rank, higherRank, NULL, higherName)

getSelectedTaxonNames <- function(
    inputTaxonIDs = NULL, rank = NULL,
    higherRank = NULL, higherID = NULL, higherName = NULL
) {
    rankName <- NULL
    if (is.null(inputTaxonIDs) | is.null(rank))
        stop("Input taxa and taxonomy rank cannot be NULL!")
    taxDf <- getTaxonomyMatrix(TRUE, paste0("ncbi", inputTaxonIDs))
    if (is.null(higherID) & is.null(higherName))
        return(
            data.frame(
                ncbiID = taxDf$ncbiID[
                    taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
                name = taxDf$fullName[
                    taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
                stringsAsFactors = FALSE))
    if (is.null(higherRank)) {
        return(
            data.frame(
                ncbiID = taxDf$ncbiID[
                    taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
                name = taxDf$fullName[
                    taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
                stringsAsFactors = FALSE))
    } else {
        if (!is.null(higherName) & is.null(higherID)) {
            taxaList <- getNameList()
            superID <- taxaList$ncbiID[
                taxaList$fullName == higherName
                & taxaList$rank %in% c(higherRank, "norank")]
            customizedtaxaID <- levels(
                as.factor(taxDf[rank][taxDf[higherRank] == superID, ]))
            return(
                data.frame(
                    ncbiID = taxaList$ncbiID[
                        taxaList$rank %in% c(rank, "norank")
                        & taxaList$ncbiID %in% customizedtaxaID],
                    name = taxaList$fullName[
                        taxaList$rank %in% c(rank, "norank")
                        & taxaList$ncbiID %in% customizedtaxaID],
                    stringsAsFactors = FALSE))
        } else if (!is.null(higherID)) {
            return(
                data.frame(
                    ncbiID = taxDf$ncbiID[taxDf[,higherRank] == higherID],
                    name = taxDf$fullName[taxDf[,higherRank] == higherID],
                    stringsAsFactors = FALSE))
        }
    }
}

#' Sort list of (super)taxa based on a selected reference (super)taxon
#' @usage sortInputTaxa(taxonIDs = NULL, rankName, refTaxon = NULL,
#'     taxaTree = NULL)
#' @param taxonIDs list of taxon IDs (e.g.: ncbi1234, ncbi9999, ...). Default =
#' NULL.
#' @param rankName working taxonomy rank (e.g. "species", "phylum",...)
#' @param refTaxon selected reference taxon. Default = NULL.
#' @param taxaTree taxonomy tree for the input taxa (optional). Default = NULL.
#' @return A taxonomy matrix for the input taxa ordered by the selected
#' reference taxon. This matrix is sorted either based on the NCBI taxonomy
#' info, or based on an user-defined taxonomy tree (if provided).
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @seealso \code{\link{getNameList}}, \code{\link{getTaxonomyMatrix}},
#' \code{\link{createRootedTree}}, \code{\link{sortTaxaFromTree}},
#' \code{\link{getInputTaxaName}}, \code{\link{getInputTaxaID}},
#' \code{\link{createLongMatrix}}
#' @examples
#' taxonIDs <- c(
#'     "ncbi10116", "ncbi123851", "ncbi3702", "ncbi13616", "ncbi9606"
#' )
#' sortInputTaxa(taxonIDs, "species", "Homo sapiens", NULL)

sortInputTaxa <- function(
    taxonIDs = NULL, rankName, refTaxon = NULL, taxaTree = NULL
){
    ncbiID <- fullName <- abbrName <- NULL
    if (missing(rankName)) return("No taxonomy rank name given!")
    allMainRanks <- getTaxonomyRanks()
    if (!(rankName[1] %in% allMainRanks)) return("Invalid taxonomy rank given!")
    if (is.null(refTaxon))  refTaxon <- taxonNames$fullName[1]
    # get list of taxon names
    fullnameList <- getNameList()
    taxonNames <- getInputTaxaName(rankName, taxonIDs)
    # get selected supertaxon ID(s)
    rankNameTMP <- taxonNames$rank[taxonNames$fullName == refTaxon]
    if (rankName == "strain") {
        superID <- fullnameList$ncbiID[fullnameList$fullName == refTaxon]
    } else
        superID <- fullnameList$ncbiID[
            fullnameList$fullName == refTaxon
            & fullnameList$rank == rankNameTMP[1]]
    # get full taxonomy data & representative taxon
    Dt <- getTaxonomyMatrix()
    repTaxon <- Dt[Dt[, rankName] == superID, ][1, ]
    # THEN, SORT TAXON LIST BASED ON TAXONOMY TREE
    if (is.null(taxaTree)) {
        distDf <- subset(Dt, select = -c(ncbiID, fullName))
        row.names(distDf) <- distDf$abbrName
        distDf <- distDf[, -1]
        taxaTree <- createRootedTree(distDf, as.character(repTaxon$abbrName))
    } else
        taxaTree <- ape::root(
            taxaTree,outgroup=as.character(repTaxon$abbrName),resolve.root=TRUE)
    taxonList <- sortTaxaFromTree(taxaTree)
    sortedDt <- Dt[match(taxonList, Dt$abbrName), ]
    # subset to get list of input taxa only
    sortedDt <- subset(sortedDt, abbrName %in% taxonIDs)
    # get only taxonIDs list of selected rank and rename columns
    sortedOut <- subset(
        sortedDt,
        select = c("abbrName", "ncbiID", "fullName", as.character(rankName)))
    colnames(sortedOut) <- c("abbrName", "species", "fullName", "ncbiID")
    # add name of supertaxa into sortedOut list
    sortedOut <- merge(
        sortedOut, fullnameList, by = "ncbiID", all.x = TRUE, sort = FALSE)
    sortedOut$species <- paste0("ncbi", sortedOut$species)
    ## create new column for sorted supertaxon
    indexSpec <- unlist(lapply(
        seq_len(nlevels(as.factor(sortedOut$fullName.y))),function (x) 1000+x))
    indexSpecDf <- data.frame(
        fullName.y = unique(as.character(sortedOut$fullName.y)),
        sortedSupertaxon = paste0(
            indexSpec, "_", unique(as.character(sortedOut$fullName.y))
        ), stringsAsFactors = FALSE)
    sortedOut <- merge(indexSpecDf, sortedOut, by = "fullName.y")
    # final sorted supertaxa list
    sortedOut$taxonID <- 0
    sortedOut$category <- "cat"
    sortedOut <- sortedOut[, c(
        "abbrName", "taxonID", "fullName.x", "species", "ncbiID",
        "sortedSupertaxon", "rank", "category")]
    colnames(sortedOut) <- c(
        "abbrName", "taxonID", "fullName", "ncbiID", "supertaxonID",
        "supertaxon", "rank", "category")
    sortedOut$ncbiID <- as.factor(sortedOut$ncbiID)
    sortedOut$supertaxon <- as.factor(sortedOut$supertaxon)
    sortedOut$category <- as.factor(sortedOut$category)
    return(sortedOut)
}

#' Calculate percentage of present species in each super taxon
#' @export
#' @param profileWithTax data frame of main PhyloProfile input together
#' with their taxonomy info (see ?profileWithTaxonomy)
#' @param taxaCount number of species occur in each supertaxon (e.g. phylum
#' or kingdom)
#' @return A data frame with % of present species in each supertaxon
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @seealso \code{\link{profileWithTaxonomy}} for a demo input data
#' @examples
#' # NOTE: for internal testing only - not recommended for outside using
#' data("profileWithTaxonomy", package="PhyloProfile")
#' taxaCount <- plyr::count(profileWithTaxonomy, "supertaxon")
#' taxaCount$freq <- 1
#' calcPresSpec(profileWithTaxonomy, taxaCount)

calcPresSpec <- function(profileWithTax, taxaCount){
    paralog <- abbrName <- NULL
    if (missing(profileWithTax)) return("No input data given")
    if (missing(taxaCount)) return("No supertaxon count given")
    profileWithTax <- profileWithTax[profileWithTax$orthoID != "NA", ]
    
    # get geneID and supertaxon
    geneIDSupertaxon <- subset(
        profileWithTax,
        select = c("geneID", "supertaxon", "paralog", "abbrName")
    )
    # remove duplicated rows
    geneIDSupertaxon <- geneIDSupertaxon[!duplicated(geneIDSupertaxon), ]
    # remove NA rows from profileWithTax
    profileWithTaxNoNA <- profileWithTax[profileWithTax$orthoID != "NA", ]
    # count present frequency of supertaxon for each gene
    geneSupertaxonCount <- plyr::count(
        profileWithTaxNoNA, c("geneID", "supertaxon")
    )
    # merge with taxaCount to get total number of species of each supertaxon
    # and calculate presSpec
    presSpecDt <- merge(
        geneSupertaxonCount, taxaCount, by = "supertaxon", all.x = TRUE
    )
    specCount <- plyr::count(geneIDSupertaxon, c("geneID", "supertaxon"))
    presSpecDt <- merge(
        presSpecDt, specCount, by = c("geneID", "supertaxon")
    )
    presSpecDt$presSpec <- round(presSpecDt$freq / presSpecDt$freq.y, digits=2)
    presSpecDt <- presSpecDt[presSpecDt$presSpec <= 1, ]
    presSpecDt <- presSpecDt[order(presSpecDt$geneID), ]
    presSpecDt <- presSpecDt[, c("geneID", "supertaxon", "presSpec")]
    # add absent supertaxon into presSpecDt
    geneIDSupertaxon <- subset(
        geneIDSupertaxon, select = -c(paralog, abbrName)
    )
    finalPresSpecDt <- merge(
        presSpecDt, geneIDSupertaxon,
        by = c("geneID", "supertaxon"), all.y = TRUE
    )
    finalPresSpecDt$presSpec[is.na(finalPresSpecDt$presSpec)] <- 0
    # remove duplicated and NA rows
    finalPresSpecDt <- finalPresSpecDt[!duplicated(finalPresSpecDt), ]
    finalPresSpecDt <- finalPresSpecDt[stats::complete.cases(finalPresSpecDt), ]
    # return finalPresSpecDt
    return(finalPresSpecDt)
}

#' Parsing info for phylogenetic profiles
#' @description Creating main dataframe for the input phylogenetic profiles
#' based on selected input taxonomy level (e.g. strain, species) and reference
#' taxon. The output contains the number of paralogs, the max/min/mean/median 
#' of VAR1 and VAR2.
#' @usage parseInfoProfile(inputDf, sortedInputTaxa, taxaCount, coorthoCOMax)
#' @param inputDf input profiles in long format
#' @param sortedInputTaxa sorted taxonomy data for the input taxa
#' (check sortInputTaxa())
#' @param taxaCount dataframe counting present taxa in each supertaxon
#' @param coorthoCOMax maximum number of co-orthologs allowed
#' @return A dataframe contains all info for the input phylogenetic profiles.
#' This full processed profile that is required for several profiling analyses
#' e.g. estimation of gene age (?estimateGeneAge) or identification of core gene
#' (?getCoreGene).
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{sortInputTaxa}},
#' \code{\link{calcPresSpec}}, \code{\link{mainLongRaw}}
#' @examples
#' data("mainLongRaw", package="PhyloProfile")
#' taxonIDs <- getInputTaxaID(mainLongRaw)
#' sortedInputTaxa <- sortInputTaxa(
#'     taxonIDs, "class", "Mammalia", NULL
#' )
#' taxaCount <- plyr::count(sortedInputTaxa, "supertaxon")
#' coorthoCOMax <- 999
#' parseInfoProfile(
#'     mainLongRaw, sortedInputTaxa, taxaCount, coorthoCOMax
#' )

parseInfoProfile <- function(
    inputDf, sortedInputTaxa, taxaCount, coorthoCOMax = 9999
) {
    if (is.null(inputDf) | is.null(sortedInputTaxa))
        stop("Input profiles and sorted taxonomy data cannot be NULL!")
    if (ncol(inputDf) > 3) {
        if (ncol(inputDf) < 5) colnames(inputDf)[4] <- "var1"
        else colnames(inputDf)[c(4,5)] <- c("var1", "var2")
    }
    if (coorthoCOMax < 1) coorthoCOMax <- 1
    # count number of inparalogs & calculate frequency of all supertaxa
    paralogCount <- plyr::count(inputDf, c("geneID", "ncbiID"))
    inputDf <- merge(inputDf, paralogCount, by = c("geneID", "ncbiID"))
    colnames(inputDf)[ncol(inputDf)] <- "paralog"
    # filter by number of coorthologs
    inputDf <- inputDf[!(inputDf$paralog > coorthoCOMax),]
    # merge inputDf and sortedInputTaxa to get taxonomy info
    taxaMdData <- merge(inputDf, sortedInputTaxa, by = "ncbiID")
    # merge with taxaCount to get number of species
    fullMdData <- merge(taxaMdData, taxaCount, by = ("supertaxon"), all.x=TRUE)
    names(fullMdData)[names(fullMdData) == "freq"] <- "numberSpec"
    fullMdData$fullName <- as.vector(fullMdData$fullName)
    names(fullMdData)[names(fullMdData) == "orthoID.x"] <- "orthoID"
    fullMdData <- fullMdData[!duplicated(fullMdData), ]
    return(fullMdData)
}

#' Filter phylogentic profiles
#' @description Create a filtered data needed for plotting or clustering
#' phylogenetic profiles. NOTE: this function require some intermediate steps
#' using the results from other functions. If you would like to get a full
#' processed data from the raw input, please use the function
#' fromInputToProfile() instead!
#' @usage filterProfileData(DF, taxaCount, refTaxon = NULL,
#'     percentCO = c(0, 1), coorthoCOMax = 9999,
#'     var1CO  = c(0, 1), var2CO = c(0, 1), var1Rel = "protein",
#'     var2Rel = "protein", groupByCat = FALSE, catDt = NULL, 
#'     var1AggregateBy = "max", var2AggregateBy = "max")
#' @param DF a reduced dataframe contains info for all phylogenetic
#' profiles in the selected taxonomy rank.
#' @param taxaCount dataframe counting present taxa in each supertaxon
#' @param refTaxon selected reference taxon. NOTE: This taxon will not be
#' affected by the filtering. If you want to filter all, set refTaxon <- NULL.
#' Default = NULL.
#' @param percentCO min and max cutoffs for percentage of species present
#' in a supertaxon. Default = c(0, 1).
#' @param coorthoCOMax maximum number of co-orthologs allowed. Default =
#' 9999.
#' @param var1CO min and max cutoffs for var1. Default = c(0, 1).
#' @param var2CO min anc max cutoffs for var2. Default = c(0, 1).
#' @param var1Rel relation of var1 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param var2Rel relation of var2 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param groupByCat group genes by their categories (TRUE or FALSE). Default =
#' FALSE.
#' @param catDt dataframe contains gene categories
#' (optional, NULL if groupByCat = FALSE or no info provided). Default = NULL.
#' @param var1AggregateBy aggregate method for VAR1 (max, min, mean
#' or median), applied for calculating var1 of supertaxa. Default = "max".
#' @param var2AggregateBy aggregate method for VAR2 (max, min, mean
#' or median), applied for calculating var2 of supertaxa. Default = "max".
#' @return A filtered dataframe for generating profile plot including seed gene
#' IDs (or orthologous group IDs), their ortholog IDs and the corresponding
#' (super)taxa, (super)taxon IDs, number of co-orthologs in each (super)taxon,
#' values for two additional variables var1, var2, % of species present in each
#' supertaxon, and the categories of seed genes (or ortholog groups).
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @seealso \code{\link{parseInfoProfile}} and \code{\link{reduceProfile}}
#' for generating input dataframe, \code{\link{fullProcessedProfile}} for a
#' demo full processed profile dataframe, \code{\link{fromInputToProfile}} for
#' generating fully processed data from raw input.
#' @examples
#' # NOTE: this function require some intermediate steps using the results from
#' # other functions. If you would like to get a full processed data from the
#' # raw input, please use the function fromInputToProfile() instead!
#' data("fullProcessedProfile", package="PhyloProfile")
#' rankName <- "class"
#' refTaxon <- "Mammalia"
#' percentCutoff <- c(0.0, 1.0)
#' coorthologCutoffMax <- 10
#' var1Cutoff <- c(0.75, 1.0)
#' var2Cutoff <- c(0.5, 1.0)
#' var1Relation <- "protein"
#' var2Relation <- "species"
#' groupByCat <- FALSE
#' catDt <- NULL
#' var1AggregateBy <- "max"
#' var2AggregateBy <- "max"
#' taxonIDs <- levels(as.factor(fullProcessedProfile$ncbiID))
#' sortedInputTaxa <- sortInputTaxa(
#'     taxonIDs, rankName, refTaxon, NULL
#' )
#' taxaCount <- plyr::count(sortedInputTaxa, "supertaxon")
#' filterProfileData(
#'     fullProcessedProfile,
#'     taxaCount,
#'     refTaxon,
#'     percentCutoff,
#'     coorthologCutoffMax,
#'     var1Cutoff,
#'     var2Cutoff,
#'     var1Relation,
#'     var2Relation,
#'     groupByCat,
#'     catDt,
#'     var1AggregateBy,
#'     var2AggregateBy
#' )

filterProfileData <- function(
    DF, taxaCount, refTaxon = NULL, percentCO = c(0, 1), coorthoCOMax = 9999,
    var1CO = c(0, 1), var2CO = c(0, 1), var1Rel = "protein",
    var2Rel = "protein", groupByCat = FALSE, catDt = NULL,
    var1AggregateBy = "max", var2AggregateBy = "max"
) {
    if (is.null(DF)) stop("Profile data cannot be NULL!")
    if (is.null(refTaxon)) refTaxon = "NA"
    ### check if working with lowest rank (species/strain), e.g. flag == 0
    flag <- 1
    if (length(unique(levels(as.factor(DF$numberSpec)))) == 1) {
        if (unique(levels(as.factor(DF$numberSpec))) == 1) {
            flag <- 0
        }
    }
    ### remove index from supertaxon names
    DF$taxonMod <- gsub("^[[:digit:]]*_", "", DF$supertaxon)
    ### filter by var1 and var2
    if (flag == 0) {
        if (var1Rel == "protein") {
            DF$var1[DF$taxonMod != refTaxon & DF$var1 < var1CO[1]] <- NA
            DF$var1[DF$taxonMod != refTaxon & DF$var1 > var1CO[2]] <- NA
        } else {
            DF$var1[DF$taxonMod != refTaxon & DF$var1 < var1CO[1]] <- 0
            DF$var1[DF$taxonMod != refTaxon & DF$var1 > var1CO[2]] <- 0
        }
        if (var2Rel == "protein") {
            DF$var2[DF$taxonMod != refTaxon & DF$var2 < var2CO[1]] <- NA
            DF$var2[DF$taxonMod != refTaxon & DF$var2 > var2CO[2]] <- NA
        } else {
            DF$var2[DF$taxonMod != refTaxon & DF$var2 < var2CO[1]] <- 0
            DF$var2[DF$taxonMod != refTaxon & DF$var2 > var2CO[2]] <- 0
        }
    } else {
        if (var1Rel == "protein") {
            DF$var1[DF$var1 < var1CO[1]] <- NA
            DF$var1[DF$var1 > var1CO[2]] <- NA
        } else {
            DF$var1[DF$var1 < var1CO[1]] <- 0
            DF$var1[DF$var1 > var1CO[2]] <- 0
        }
        if (var2Rel == "protein") {
            DF$var2[DF$var2 < var2CO[1]] <- NA
            DF$var2[DF$var2 > var2CO[2]] <- NA
        } else {
            DF$var2[DF$var2 < var2CO[1]] <- 0
            DF$var2[DF$var2 > var2CO[2]] <- 0
        }
    }
    ### calculate % present taxa and filter by percentCO
    DFtmp <- DF[stats::complete.cases(DF), ]
    finalPresSpecDt <- calcPresSpec(DFtmp, taxaCount)
    DF <- Reduce(
        function(x, y) merge(x, y, by = c("geneID", "supertaxon"), all.x=TRUE),
        list(DF, finalPresSpecDt))
    DF$presSpec[DF$presSpec < percentCO[[1]] | DF$presSpec > percentCO[[2]]] <-0
    DF$orthoID[DF$presSpec == 0] <- NA
    if (var1Rel == "protein") {
        DF$orthoID[is.na(DF$var1)] <- NA
        DF$var1[is.na(DF$orthoID)] <- NA
    }
    if (var2Rel == "protein") {
        DF$orthoID[is.na(DF$var2)] <- NA
        DF$var2[is.na(DF$orthoID)] <- NA
    }
    DF$presSpec[is.na(DF$orthoID)] <- 0
    DF$var1[DF$presSpec == 0] <- NA
    DF$var2[DF$presSpec == 0] <- NA
    
    ### remove paralog count if NOT working with lowest rank (species/strain)
    if (flag == 1) DF$paralog <- 1
    
    DF <- droplevels(DF)  # delete unused levels
    DF$geneID <- as.factor(DF$geneID)
    DF$supertaxon <- as.factor(DF$supertaxon)
    
    # calculate max/min/mean/median VAR1 for every supertaxon of each gene
    DFNoNA <- DF[!is.na(DF$var1), ]
    mVar1Dt <- stats::aggregate(
        DFNoNA[, "var1"],
        list(DFNoNA$supertaxon, DFNoNA$geneID),
        FUN = var1AggregateBy)
    colnames(mVar1Dt) <- c("supertaxon", "geneID", "mVar1")
    # calculate max/min/mean/median VAR2 for each supertaxon
    DFNoNAVar2 <- DF[!is.na(DF$var2), ]
    if (nrow(DFNoNAVar2) > 0) {
        mVar2Dt <- stats::aggregate(
            DFNoNAVar2[, "var2"],
            list(DFNoNAVar2$supertaxon, DFNoNAVar2$geneID),
            FUN = var2AggregateBy)
        colnames(mVar2Dt) <- c("supertaxon", "geneID", "mVar2")
    } else {
        mVar2Dt <- DF[, c("supertaxon", "geneID")]
        mVar2Dt$mVar2 <- 0
    }
    # join mVar2 together with mVar1 scores into one df
    scoreDf <- merge(mVar1Dt, mVar2Dt, by = c("supertaxon","geneID"), all=TRUE)
    # add into DF
    DF <- Reduce(
        function(x, y) merge(x, y, by = c("geneID", "supertaxon"), all.x=TRUE),
        list(DF, scoreDf))
    
    ### add gene categories (if provided)
    if (groupByCat == TRUE) {
        if (is.null(catDt)) {
            catDt <- data.frame( geneID = levels(DF$geneID))
            catDt$group <- "noCategory"
        }
        dfCat <- data.frame(
            supertaxon = rep(levels(DF$supertaxon), nlevels(DF$geneID)),
            geneID = rep(levels(DF$geneID), each = nlevels(DF$supertaxon)))
        dfCat <- merge(dfCat, catDt, by = "geneID")
        DF <- merge(dfCat, DF, by = c("geneID","supertaxon"), all.x = TRUE)
        DF$category <- DF$group
    }
    return(DF[!is.na(DF$orthoID),])
}

#' Reduce the filtered profile data into supertaxon level
#' @description Reduce data of the processed phylogenetic profiles from input
#' taxonomy rank into supertaxon level (e.g. from species to phylum)
#' @param filteredProfile dataframe contains the filtered profiles (see
#' ?parseInfoProfile, ?filterProfileData and ?filteredProfile)
#' @return A reduced dataframe contains only profile data for the selected
#' supertaxon rank. This dataframe contains only supertaxa and their value
#' (mVar1 & mVar2) for each gene.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @seealso \code{\link{parseInfoProfile}} for creating a full processed
#' profile dataframe, \code{\link{filterProfileData}} for filter processed 
#' profile and \code{\link{filteredProfile}} for a demo filtered
#' profile dataframe
#' @examples
#' data("filteredProfile", package="PhyloProfile")
#' reduceProfile(filteredProfile)

reduceProfile <- function(filteredProfile) {
    if (is.null(filteredProfile)) stop("Profile data cannot be NULL!")
    
    # check if working with the lowest taxonomy rank; 1 for NO; 0 for YES
    flag <- 1
    if (length(unique(levels(as.factor(filteredProfile$numberSpec)))) == 1) {
        if (unique(levels(as.factor(filteredProfile$numberSpec))) == 1) {
            superDfExt <- filteredProfile[, c(
                "geneID", "supertaxon", "supertaxonID",
                "var1", "presSpec", "category", "orthoID", "var2", "paralog"
            )]
            flag <- 0
        }
    }
    if (flag == 1) {
        # get representative orthoID that has m VAR1 for each supertaxon
        mOrthoID <- filteredProfile[, c(
            "geneID", "supertaxon", "var1", "mVar1", "orthoID", "presSpec"
        )]
        mOrthoID <- subset(mOrthoID, mOrthoID$var1 == mOrthoID$mVar1)
        colnames(mOrthoID) <- c(
            "geneID", "supertaxon", "var1", "mVar1", "orthoID", "presSpec"
        )
        mOrthoID <- mOrthoID[!is.na(mOrthoID$orthoID), ]
        mOrthoID <- mOrthoID[, c("geneID", "supertaxon", "orthoID", "presSpec")]
        mOrthoID <- mOrthoID[!duplicated(mOrthoID[, seq_len(2)]), ]
        # get data set for PhyloProfile plotting (contains only supertaxa info)
        superDf <- subset(filteredProfile, select = c(
            "geneID", "supertaxon", "supertaxonID",
            "mVar1", "category", "mVar2", "paralog"
        ))
        superDf <- superDf[!duplicated(superDf), ]
        superDfExt <- merge(
            superDf, mOrthoID, by = c("geneID", "supertaxon"), all.x = TRUE
        )
        superDfExt <- superDfExt[, c(
            "geneID", "supertaxon", "supertaxonID",
            "mVar1", "presSpec", "category", "orthoID", "mVar2", "paralog"
        )]
        # rename mVar to var
        names(superDfExt)[names(superDfExt) == "mVar1"] <- "var1"
        names(superDfExt)[names(superDfExt) == "mVar2"] <- "var2"
    }
    return(superDfExt)
}

#' Complete processing of raw input phylogenetic profiles
#' @description Create a processed and filtered data for plotting or analysing
#' phylogenetic profiles from raw input file (from raw input to final filtered
#' dataframe)
#' @usage fromInputToProfile(rawInput, rankName, refTaxon = NULL,
#'     taxaTree = NULL, var1AggregateBy = "max", var2AggregateBy = "max",
#'     percentCutoff = c(0, 1), coorthologCutoffMax = 9999,
#'     var1Cutoff = c(0, 1), var2Cutoff = c(0, 1), var1Relation = "protein",
#'     var2Relation = "protein", groupByCat = FALSE, catDt = NULL)
#' @param rawInput input file (in long, wide, multi-fasta or orthoxml format)
#' @param rankName taxonomy rank (e.g. "species","phylum",...)
#' @param refTaxon selected reference taxon name (used for sorting and will be
#' protected from filtering). Default = NULL.
#' @param taxaTree input taxonomy tree for taxa in input profiles (optional).
#' Default = NULL.
#' @param var1AggregateBy aggregate method for var1 (min, max, mean or median).
#' Default = "max".
#' @param var2AggregateBy aggregate method for VAR2 (min, max, mean or median).
#' Default = "max".
#' @param percentCutoff min and max cutoffs for percentage of species present
#' in a supertaxon. Default = c(0, 1).
#' @param coorthologCutoffMax maximum number of co-orthologs allowed. Default =
#' 9999.
#' @param var1Cutoff min and max cutoffs for var1. Default = c(0, 1).
#' @param var2Cutoff min and max cutoffs for var2. Default = c(0, 1).
#' @param var1Relation relation of var1 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param var2Relation relation of var2 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param groupByCat group genes by their categories (TRUE or FALSE). Default =
#' FALSE.
#' @param catDt dataframe contains gene categories. Default = NULL.
#' @return Dataframe required for generating phylogenetic profile plot or
#' clustering analysis. It contains seed gene IDs (or orthologous group IDs),
#' their ortholog IDs and the corresponding (super)taxa, (super)taxon IDs,
#' number of co-orthologs in each (super)taxon, values for two additional
#' variables var1, var2, % of species present in each supertaxon, and the
#' categories of seed genes (or ortholog groups).
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{getInputTaxaID}},
#' \code{\link{getInputTaxaName}}, \code{\link{sortInputTaxa}},
#' \code{\link{parseInfoProfile}}, \code{\link{reduceProfile}},
#' \code{\link{filterProfileData}}
#' @examples
#' rawInput <- system.file(
#'     "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
#' )
#' rankName <- "class"
#' refTaxon <- "Mammalia"
#' taxaTree <- NULL
#' var1AggregateBy <- "max"
#' var2AggregateBy <- "mean"
#' percentCutoff <- c(0.0, 1.0)
#' coorthologCutoffMax <- 10
#' var1Cutoff <- c(0.75, 1.0)
#' var2Cutoff <- c(0.5, 1.0)
#' var1Relation <- "protein"
#' var2Relation <- "species"
#' groupByCat <- FALSE
#' catDt <- NULL
#' fromInputToProfile(
#'     rawInput,
#'     rankName,
#'     refTaxon,
#'     taxaTree,
#'     var1AggregateBy,
#'     var2AggregateBy,
#'     percentCutoff,
#'     coorthologCutoffMax,
#'     var1Cutoff,
#'     var2Cutoff,
#'     var1Relation,
#'     var2Relation,
#'     groupByCat,
#'     catDt
#' )

fromInputToProfile <- function(
    rawInput,
    rankName,
    refTaxon = NULL,
    taxaTree = NULL,
    var1AggregateBy = "max",
    var2AggregateBy = "max",
    percentCutoff = c(0, 1),
    coorthologCutoffMax = 9999,
    var1Cutoff = c(0, 1),
    var2Cutoff = c(0, 1),
    var1Relation = "protein",
    var2Relation = "protein",
    groupByCat = FALSE,
    catDt = NULL
) {
    if (missing(rawInput) | missing(rankName)) return("Missing input")
    if (is.null(rawInput) | is.null(rankName)) return("Missing input")
    # convert raw input into long format
    inputDf <- createLongMatrix(rawInput)
    # get input taxon IDs and names
    inputTaxonID <- getInputTaxaID(inputDf)
    # sort input taxa based on selected reference taxon or input taxonomy tree
    sortedInputTaxa <- sortInputTaxa(inputTaxonID, rankName, refTaxon, taxaTree)
    # count present taxa in each supertaxon
    taxaCount <- plyr::count(sortedInputTaxa, "supertaxon")
    # parse info (additional values...) into profile df
    fullMdData <- parseInfoProfile(inputDf, sortedInputTaxa, taxaCount)
    # filter profile
    filteredDf <- filterProfileData(
        fullMdData,
        taxaCount,
        refTaxon,
        percentCutoff,
        coorthologCutoffMax,
        var1Cutoff,
        var2Cutoff,
        var1Relation,
        var2Relation,
        groupByCat = FALSE,
        catDt = NULL, 
        var1AggregateBy, var2AggregateBy
    )
    # reduce profile df into supertaxon level
    dataHeat <- reduceProfile(filteredDf)
    return(dataHeat)
}
Any scripts or data that you put into this service are public.
PhyloProfile documentation built on March 27, 2021, 6:01 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PhyloProfile
PhyloProfile

R/parsePhyloProfile.R
In PhyloProfile: PhyloProfile

Defines functions fromInputToProfile reduceProfile filterProfileData parseInfoProfile calcPresSpec sortInputTaxa getSelectedTaxonNames getInputTaxaName getInputTaxaID getTaxonomyMatrix getNameList getTaxonomyRanks

Documented in calcPresSpec filterProfileData fromInputToProfile getInputTaxaID getInputTaxaName getNameList getSelectedTaxonNames getTaxonomyMatrix getTaxonomyRanks parseInfoProfile reduceProfile sortInputTaxa

Try the PhyloProfile package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PhyloProfile PhyloProfile

R/parsePhyloProfile.R In PhyloProfile: PhyloProfile

Defines functions fromInputToProfile reduceProfile filterProfileData parseInfoProfile calcPresSpec sortInputTaxa getSelectedTaxonNames getInputTaxaName getInputTaxaID getTaxonomyMatrix getNameList getTaxonomyRanks

Documented in calcPresSpec filterProfileData fromInputToProfile getInputTaxaID getInputTaxaName getNameList getSelectedTaxonNames getTaxonomyMatrix getTaxonomyRanks parseInfoProfile reduceProfile sortInputTaxa

Try the PhyloProfile package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PhyloProfile
PhyloProfile

R/parsePhyloProfile.R
In PhyloProfile: PhyloProfile