R/createTaxonomyMatrix.R

Defines functions taxonomyTableCreator getIDsRank getTaxonomyInfo processNcbiTaxonomy mainTaxonomyRank

Documented in getIDsRank getTaxonomyInfo mainTaxonomyRank processNcbiTaxonomy taxonomyTableCreator

#' Get all NCBI taxonomy rank names
#' @export
#' @return A list of all available NCBI taxonomy rank names.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @examples
#' mainTaxonomyRank()

mainTaxonomyRank <- function() {
    return(
        c(
            "strain","biotype","isolate","pathogroup","serogroup","serotype",
            "genotype","morph","forma","subspecies","subvariety","varietas",
            "formaspecialis","subspecies","species","speciessubgroup",
            "speciesgroup","series","subgenus","genus","subtribe","tribe",
            "subfamily","family","superfamily",
            "parvorder","infraorder","suborder","order","superorder",
            "subcohort","cohort","infraclass","subclass","class","superclass",
            "subphylum","phylum","superphylum",
            "subkingdom","kingdom","superkingdom"
        )
    )
}

#' Pre-processing NCBI taxonomy data
#' @description Download NCBI taxonomy database and parse information that are
#' needed for PhyloProfile, including taxon IDs, their scientific names,
#' systematic ranks, and parent (next higher) rank IDs.
#' @return A dataframe contains NCBI taxon IDs, taxon names, taxon ranks and the
#' next higher taxon IDs (parent's IDs) of all taxa in the NCBI taxonomy
#' database.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @examples
#' ?processNcbiTaxonomy
#' \dontrun{
#' preProcessedTaxonomy <- processNcbiTaxonomy()
#' # save to text (tab-delimited) file
#' write.table(
#'     preProcessedTaxonomy,
#'     file = "preProcessedTaxonomy.txt",
#'     col.names = TRUE,
#'     row.names = FALSE,
#'     quote = FALSE,
#'     sep = "\t"
#' )
#' # save to rdata file
#' save(
#'     preProcessedTaxonomy, file = "preProcessedTaxonomy.RData", compress='xz'
#' )
#' }

processNcbiTaxonomy <- function() {
    temp <- tempfile()
    utils::download.file(
        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip", temp
    )
    names <- utils::read.table(
        unz(temp, "names.dmp"), header = FALSE, fill = TRUE, sep = "\t", 
        quote = "", comment.char = "", stringsAsFactors = FALSE
    )
    nodes <- utils::read.table(
        unz(temp, "nodes.dmp"), header = FALSE, fill = TRUE, sep = "\t",
        quote = "", comment.char = "", stringsAsFactors = FALSE
    )
    unlink(temp)
    message("Download NCBI taxonomy done!")
    # Create data frame containing taxon ID, the scientific name, its taxonomy
    # rank and the taxon ID of the higer rank (parent's ID)
    outTax <- merge(
        names[names$V7 == "scientific name", c("V1", "V3")],
        nodes[,c("V1", "V5", "V3")],
        by = "V1"
    )
    colnames(outTax) <- c("ncbiID", "fullName", "rank","parentID")
    # Remove "'" from taxon names and ranks, remove space from taxon ranks
    outTax$fullName <- gsub("'", "",outTax$fullName)
    outTax$rank <- gsub("'", "", outTax$rank)
    outTax$rank <- gsub(" ", "", outTax$rank)
    # Add Holozoa and Holomycota ranks
    opiMod <- data.frame(
        ncbiID = c(999999001, 999999002, 999999003, 999999004, 999999005),
        fullName = c(
            "Choanozoa", "Filozoa", "Pluriformea", "Holozoa", "Holomycota"),
        rank = rep("norank", 5),
        parentID = c(999999002, 999999003, 999999004, 33154, 33154),
        stringsAsFactors = FALSE
    )
    outTax[outTax$ncbiID %in% c(28009, 33208),]$parentID <- 999999001
    outTax[outTax$ncbiID == 2687318,]$parentID <- 999999002
    outTax[outTax$ncbiID == 42461,]$parentID <- 999999003
    outTax[outTax$ncbiID == 127916,]$parentID <- 999999004
    outTax[outTax$ncbiID %in% c(4751, 2686024),]$parentID <- 999999005
    outTax <- do.call(rbind, list(outTax, opiMod))
    # Return
    message("Parsing NCBI taxonomy done!")
    return(outTax)
}

#' Get taxonomy info for a list of input taxa
#' @param inputTaxa NCBI taxonomy IDs of input taxa.
#' @param currentNCBIinfo table/dataframe of the pre-processed NCBI taxonomy
#' data (/PhyloProfile/data/preProcessedTaxonomy.txt)
#' @return A list of NCBI taxonomy info for input taxa, including the taxonomy
#' IDs, full scientific names, taxonomy ranks and the parent IDs.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @examples
#' inputTaxa <- c("272557", "176299")
#' ncbiFilein <- system.file(
#'     "extdata", "data/preProcessedTaxonomy.txt",
#'     package = "PhyloProfile", mustWork = TRUE
#' )
#' currentNCBIinfo <- as.data.frame(data.table::fread(ncbiFilein))
#' getTaxonomyInfo(inputTaxa, currentNCBIinfo)
getTaxonomyInfo <- function(inputTaxa = NULL, currentNCBIinfo = NULL) {
    tmp <- list()
    outList <- list()
    k <- 1
    for (refID in inputTaxa) {
        # get info for this taxon
        refEntry <- currentNCBIinfo[currentNCBIinfo$ncbiID == refID, ]
        lastID <- refEntry$parentID
        inputTaxaInfo <- refEntry
        while (lastID != 1) {
            if (lastID %in% names(tmp)) {
                inputTaxaInfo <- rbindlist(
                    list(inputTaxaInfo, tmp[[toString(lastID)]]), 
                    use.names = TRUE, fill = TRUE, idcol = NULL)
                lastID = 1
            } else {
                nextEntry <- currentNCBIinfo[currentNCBIinfo$ncbiID == lastID, ]
                inputTaxaInfo <- rbindlist(
                    list(inputTaxaInfo, nextEntry), use.names = TRUE,
                    fill = TRUE, idcol = NULL)
                lastID <- nextEntry$parentID
            }
        }
        for (i in seq(inputTaxaInfo$ncbiID)) {
            if (!(inputTaxaInfo$ncbiID[i] %in% names(tmp))) {
                tmp[[toString(inputTaxaInfo$ncbiID[i])]] <-
                    inputTaxaInfo[i:length(inputTaxaInfo$ncbiID)]
            }
        }
        outList[[k]] <- inputTaxaInfo
        k <- k + 1
    }
    return(outList)
}

#' Get taxonomy info for a list of taxa
#' @description Get NCBI taxonomy IDs, ranks and names for an input taxon list.
#' @param inputTaxa NCBI ID list of input taxa.
#' @param currentNCBIinfo table/dataframe of the pre-processed NCBI taxonomy
#' data (/PhyloProfile/data/preProcessedTaxonomy.txt)
#' @return A list of 3 dataframes: idList, rankList and reducedInfoList. The
#' "rankList" contains taxon names and all taxonomy ranks of the input taxa
#' including also the noranks from the input rank to the taxonomy root. The
#' "idList" contains input taxon IDs, taxon names, all the ranks from current
#' rank to the taxonomy root together with their IDs (with the format
#' "id#rank"). The reducedInfoList is a subset of preProcessedTaxonomy.txt file,
#' containing the NCBI IDs, taxon fullnames, their current rank and their
#' direct parent ID.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @export
#' @examples
#' inputTaxa <- c("272557", "176299")
#' ncbiFilein <- system.file(
#'     "extdata", "data/preProcessedTaxonomy.txt",
#'     package = "PhyloProfile", mustWork = TRUE
#' )
#' currentNCBIinfo <- as.data.frame(data.table::fread(ncbiFilein))
#' getIDsRank(inputTaxa, currentNCBIinfo)

getIDsRank <- function(inputTaxa = NULL, currentNCBIinfo = NULL){
    if (is.null(currentNCBIinfo)) stop("Pre-processed NCBI tax data is NULL!")
    inputTaxaInfo <- getTaxonomyInfo(inputTaxa, currentNCBIinfo)
    allMainRank <- mainTaxonomyRank()
    ## get reduced taxonomy info (subset of preProcessedTaxonomy.txt)
    reducedDf <- unique(rbindlist(inputTaxaInfo))
    ## get list of all ranks and rank#IDs
    rankMod <- ncbiID <- NULL
    inputRankIDDf <- lapply(
        seq_len(length(inputTaxaInfo)),
        function (x) {
            inputTaxaInfo[[x]]$rank[
                !(inputTaxaInfo[[x]]$rank %in% allMainRank)] <- "norank"
            inputTaxaInfo[[x]]$rankMod <- inputTaxaInfo[[x]]$rank
            if (inputTaxaInfo[[x]]$rank[1] == "norank")
                inputTaxaInfo[[x]]$rankMod[1] <-
                    paste0("strain_", inputTaxaInfo[[x]]$ncbiID[1])
            inputTaxaInfo[[x]]$rankMod <- with(
                inputTaxaInfo[[x]],
                ifelse(rankMod == "norank", paste0("norank_", ncbiID), rankMod))
            inputTaxaInfo[[x]]$id <- paste0(
                inputTaxaInfo[[x]]$ncbiID, "#", inputTaxaInfo[[x]]$rankMod)
            return(inputTaxaInfo[[x]][ , c("rankMod", "id")])
        }
    )
    inputRankList <- lapply(
        seq_len(length(inputRankIDDf)),
        function (x) {
            ll <- c(
                paste0("ncbi", inputTaxa[x]),
                reducedDf$fullName[reducedDf$ncbiID == inputTaxa[x]],
                inputRankIDDf[[x]]$rank, "norank_1")
            ll <- gsub("strain_[[:digit:]]+", "strain", ll)
            return(data.frame(
                matrix(ll, nrow = 1, byrow = TRUE), stringsAsFactors = FALSE))
        }
    )
    inputRankDf <- do.call(plyr::rbind.fill, inputRankList)
    inputIDList <- lapply(
        seq_len(length(inputRankIDDf)),
        function (x) {
            ll <- c(paste0(
                reducedDf$fullName[reducedDf$ncbiID==inputTaxa[x]],"#name"),
                inputRankIDDf[[x]]$id, "1#norank_1")
            return(data.frame(
                matrix(ll, nrow = 1, byrow = TRUE), stringsAsFactors = FALSE))
        }
    )
    inputIDDf <- do.call(plyr::rbind.fill, inputIDList)
    newCol <- seq(ncol(inputIDDf) + 1, ncol(inputRankDf))
    inputIDDf[paste0("X", newCol)] <- NA
    reducedDf$rank[!(reducedDf$rank %in% allMainRank)] <- "norank"
    return(list(inputIDDf, inputRankDf, as.data.frame(reducedDf)))
}

#' Indexing all available ranks (including norank)
#' @param rankListFile Input file, where each row is a rank list of a taxon
#' (see rankListFile in example)
#' @return A dataframe containing a list of all possible ranks and their indexed
#' values.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @examples
#' \dontrun{
#' rankListFile <- system.file(
#'     "extdata", "data/rankList.txt", package = "PhyloProfile", mustWork = TRUE
#' )
#' rankIndexing(rankListFile)
#' }

rankIndexing <- function (rankListFile = NULL) {
    if (is.null(rankListFile)) stop("Rank list file is NULL!")
    rankList <- utils::read.table(
        rankListFile, sep = '\t', header = FALSE,fill = TRUE,
        stringsAsFactors = TRUE, na.strings = c("", "NA")
    )
    uList <- unlist(rankList[seq(3, length(rankList))])
    allInputRank <- as.character(unique(uList))
    allInputRank <- allInputRank[!is.na(allInputRank)]
    ### initial index for main ranks
    mainRank <- mainTaxonomyRank()
    rank2index <- new.env(hash = TRUE)
    getHash <- Vectorize(get, vectorize.args = "x")
    assignHash <- Vectorize(assign, vectorize.args = c("x", "value"))
    for (i in seq_len(length(mainRank))) rank2index[[mainRank[i]]] <- i
    ### the magic happens here
    for (k in seq_len(nrow(rankList))) {
        ## get rank list for current taxon containing only ranks in allInputRank
        subList <- rankList[k,][!is.na(rankList[k,])]
        filter <- vapply(
            subList, function(x) x %in% allInputRank, FUN.VALUE = logical(1))
        subList <- subList[filter]
        ## indexing
        tmpEnv <- new.env(hash = TRUE)
        flag <- 0
        for (i in seq_len(length(subList))) {
            iRank <- subList[i]
            if (is.null(rank2index[[iRank]])) {
                # for new rank: get index of prev avail from this taxon
                for (j in seq_len(length(subList))) {
                    if (j < i) {
                        if (!is.null(tmpEnv[[subList[i - j]]])) {
                            tmpEnv[[iRank]] <- tmpEnv[[subList[i - j]]] + 1
                            break
                        }
                    } else j = j - 1
                }
            } else {
                # for old rank
                if (i > 1) {
                    if (flag == 0) {
                        currentIndex <- rank2index[[iRank]]
                    } else {
                        currentIndex <- tmpEnv[[iRank]]
                    }
                    if (currentIndex <= tmpEnv[[subList[i-1]]]) {
                        if (flag == 0) {
                            tmpEnv[[iRank]] <- tmpEnv[[subList[i-1]]] + 1
                            for (
                                r in 
                                ls(rank2index)[!(ls(rank2index)%in%ls(tmpEnv))]
                            ) {
                                if (rank2index[[r]] > currentIndex) {
                                    tmpEnv[[r]] <- 
                                        rank2index[[r]] + 
                                        (tmpEnv[[iRank]] - rank2index[[iRank]])
                                    flag <- 1
                                }
                            }
                        } else {
                            step <- tmpEnv[[subList[i-1]]] - currentIndex + 1
                            for (n in ls(rank2index)) {
                                if (rank2index[[n]] >= currentIndex) {
                                    tmpEnv[[n]] <- rank2index[[n]] + step
                                }
                            }
                        }
                        assignHash(
                            ls(tmpEnv), getHash(ls(tmpEnv), tmpEnv), rank2index
                        )
                    } else {
                        if (is.null(tmpEnv[[iRank]])) {
                            tmpEnv[[iRank]] <- rank2index[[iRank]]
                        }
                    }
                } else {
                    tmpEnv[[iRank]] <- rank2index[[iRank]]
                }
            }
        }
        assignHash(ls(tmpEnv), getHash(ls(tmpEnv), tmpEnv), rank2index)
    }
    # convert env into dataframe and return
    index2RankList <- lapply(
        seq_len(length(allInputRank)), function (x) {
            data.frame(
                index = rank2index[[allInputRank[x]]],
                rank = allInputRank[x], stringsAsFactors = FALSE
            )
        }
    )
    index2RankDf <- do.call(rbind, index2RankList)
    index2RankDf <- index2RankDf[with(index2RankDf, order(index2RankDf$index)),]
    return(index2RankDf)
}

#' Align NCBI taxonomy IDs of list of taxa into a sorted rank list.
#' @export
#' @param idListFile a text file whose each row is a rank+ID list of a taxon
#' (see idListFile in example)
#' @param rankListFile a text file whose each row is a rank list of a taxon
#' (see rankListFile in example)
#' @return An aligned taxonomy dataframe which contains all the available
#' taxonomy ranks from the id and rank list file. This dataframe can be used for
#' creating a well resolved taxonomy tree (see ?createRootedTree) and sorting
#' taxa based on a selected reference taxon (see ?sortInputTaxa).
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @seealso \code{\link{rankIndexing}}, \code{\link{createRootedTree}},
#' \code{\link{sortInputTaxa}}
#' @examples
#' idListFile <- system.file(
#'     "extdata", "data/idList.txt", package = "PhyloProfile", mustWork = TRUE
#' )
#' rankListFile <- system.file(
#'     "extdata", "data/rankList.txt", package = "PhyloProfile", mustWork = TRUE
#' )
#' taxonomyTableCreator(idListFile, rankListFile)

taxonomyTableCreator <- function(idListFile = NULL, rankListFile = NULL) {
    if (is.null(idListFile) | is.null(rankListFile))
        stop("Taxonomy ID list or rank list file is NULL!")
    index <- NULL
    index2RankDf <- rankIndexing(rankListFile)
    ncol <- max(utils::count.fields(rankListFile, sep = '\t'))
    idList <- utils::read.table(
        idListFile, sep = '\t', header = FALSE, check.names = FALSE,
        comment.char = "", fill = TRUE, stringsAsFactors = TRUE,
        na.strings = c("","NA"), col.names = paste0('X', seq_len(ncol)))
    colnames(idList)[1] <- "tip"
    orderedRank <- factor(index2RankDf$rank, levels = index2RankDf$rank)
    ### create a dataframe containing ordered ranks
    fullRankIDdf <- data.frame(
        rank = matrix(
            unlist(orderedRank), nrow = length(orderedRank), byrow = TRUE
        ), stringsAsFactors = FALSE)
    fullRankIDdf$index <- as.numeric(rownames(fullRankIDdf))
    fullRankIDdf <- data.table(fullRankIDdf)
    setkey(fullRankIDdf, rank)
    mTaxonDf <- pbapply::pblapply(
        seq_len(nrow(idList)),
        function (x) {
            ### get list of all IDs for this taxon & convert into long format
            taxonDf <- data.frame(idList[x,])
            taxonName <- unlist(
                strsplit(as.character(idList[x,]$tip), "#", fixed = TRUE))
            mTaxonDf <- suppressWarnings(
                data.table::melt(setDT(taxonDf), id = "tip")
            )
            ### get rank names and corresponding IDs, then remove NA cases
            splitCol <- data.frame(
                do.call(
                    'rbind',
                    strsplit(as.character(mTaxonDf$value), '#', fixed = TRUE)
                )
            )
            mTaxonDf <- cbind(mTaxonDf, splitCol)
            mTaxonDf <- mTaxonDf[stats::complete.cases(mTaxonDf),]
            ### subselect mTaxonDf to keep only 2 column rank id and rank name
            mTaxonDf <- mTaxonDf[, c("X1","X2")]
            if (mTaxonDf$X2[1] != index2RankDf$rank[1])
                mTaxonDf <- rbind(
                    data.frame(
                        "X1" = mTaxonDf$X1[1], "X2" = index2RankDf$rank[1]
                    ), mTaxonDf)
            ### rename columns & return
            colnames(mTaxonDf) <- c(taxonName[1], "rank")
            mTaxonDf <- data.table(mTaxonDf)
            setkey(mTaxonDf, rank)
            return(mTaxonDf)
        }
    )
    ### merge into data frame contains all available ranks from input
    mTaxonDfFull <- c(list(fullRankIDdf), mTaxonDf)
    fullRankIDdf <- Reduce(
        function (x, y) merge(x , y, all.x = TRUE, allow.cartesian = TRUE),
        mTaxonDfFull
    )
    ### reorder ranks & replace NA id by id of previous rank
    fullRankIDdf <- fullRankIDdf[order(fullRankIDdf$index),]
    fullRankIDdf <- zoo::na.locf(fullRankIDdf)
    ### remove index column & transpose into wide format
    fullRankIDdf <- subset(fullRankIDdf, select = -c(index))
    tFullRankIDdf <- data.table::transpose(fullRankIDdf)
    ### set first row to column names
    colnames(tFullRankIDdf) = as.character(unlist(tFullRankIDdf[1,]))
    tFullRankIDdf <- tFullRankIDdf[-1,]
    ### add "abbrName  ncbiID  fullName" columns
    fullRankIDdfOut <- data.frame(
        abbrName = paste0("ncbi", unlist(tFullRankIDdf[,1])),
        ncbiID = unlist(tFullRankIDdf[,1]),
        fullName = colnames(fullRankIDdf)[-c(1)], stringsAsFactors = FALSE)
    fullRankIDdfOut <- cbind(fullRankIDdfOut, tFullRankIDdf)
    ### rename last column to "root"
    names(fullRankIDdfOut)[ncol(fullRankIDdfOut)] <- "root"
    numericCol <- c(seq(4, ncol(fullRankIDdfOut)))
    fullRankIDdfOut[,numericCol] <- 
        as.numeric(as.character(unlist(fullRankIDdfOut[,numericCol])))
    return(fullRankIDdfOut)
}

Try the PhyloProfile package in your browser

Any scripts or data that you put into this service are public.

PhyloProfile documentation built on March 27, 2021, 6:01 p.m.