R/tax_utilities.R

Defines functions reformat_hierarchy

Documented in reformat_hierarchy

# Reformat hierarchy ------------------------------------------------------

#' Reformat hierarchy
#'
#' @param x A DNAbin with names formatted Accession|taxid;taxonomy
#' @param db A database file generated by `get_ncbi_taxonomy` or `get_ott_lineage`. Required if get_lineage is TRUE
#' @param quiet Whether progress should be printed to console
#' @param ranks The taxonomic ranks to return in the output names
#' @param multithread Whether multithreading should be used, if TRUE the number of cores will be automatically detected (Maximum available cores - 1), or provide a numeric vector to manually set the number of cores to use. Default is FALSE (single thread)
#' This argument may alternatively be a 'cluster' object, in which case it is the user's responsibility to close the socket connection at the conclusion of the operation,
#' for example by running parallel::stopCluster(cores).
#'
#' @return
#' @export
#' @import stringr
#' @import dplyr
#' @importFrom tidyr separate
#' @importFrom tidyr unite
#' @importFrom magrittr set_colnames
#' @importFrom tibble as_tibble
#' @importFrom ape as.DNAbin
#' @importFrom methods is
#'
reformat_hierarchy <- function(x, db = NULL, quiet = FALSE,
                               ranks = c("kingdom", "phylum", "class", "order", "family", "genus", "species"), multithread = FALSE) {
  time <- Sys.time() # get time
  # Convert to DNAbin
  if (!methods::is(x, "DNAbin")) {
    x <- ape::as.DNAbin(x)
  }
  if (missing(db)) {
    stop("A taxonomic database needs to be provided, generate one with get_ncbi_taxonomy or get_ott_taxonomy")
  }
  if(attr(db, "type") == "ncbi"){
    # Split current names
    seqnames <- names(x) %>%
      stringr::str_remove(";$") %>%
      stringr::str_split_fixed(";", n = Inf) %>%
      tibble::as_tibble() %>%
      tidyr::separate(col = V1, into = c("acc", "tax_id"), sep = "\\|")%>%
      dplyr::select(1:2, tail(names(.), 1)) %>%
      magrittr::set_colnames(c("acc", "tax_id", "species")) %>%
      dplyr::mutate(tax_id = suppressWarnings(as.numeric(tax_id)))

    # Get lineage from taxid
    lineage <- seqnames %>%
      dplyr::left_join (db %>% dplyr::select(-species), by = "tax_id")  %>%
      tidyr::unite(col = Acc, c(acc, tax_id), sep = "|") %>%
      tidyr::unite(col = names, c(!!ranks), sep = ";")
  } else if(attr(db, "type") == "OTT"){
    lineage <- get_ott_lineage(x, db=db, ranks=ranks, multithread=multithread) %>%
      tidyr::unite(col = names, c(!!ranks), sep = ";")
  }

  names(x) <- paste(lineage$Acc, lineage$names, "", sep = ";") %>%
  stringr::str_remove(";$")
  time <- Sys.time() - time
  if (!quiet) (message(paste0("Reformatted annotations for ", length(x), " sequences in ", format(time, digits = 2))))
  return(x)
}


# Reformat DADA2 Genus ------------------------------------------------------
#' Reformat DADA2 Genus
#' @param x A DNAbin or DNAStringSet object with names in format `accession|tax_id;Genus species`
#' @param quiet Whether progress should be printed to the console.
#' @param ranks The taxonomic ranks to include in output. default is kingdom;phylum;class;order;family;genus
#' @param db A database file generated by `get_ncbi_taxonomy` or `get_ott_lineage`. Required if get_lineage is TRUE
#'
#' @return
#' @export
#'
#'
reformat_dada2_gen <- function(x, db = NULL, ranks = c("kingdom", "phylum", "class", "order", "family", "genus"), quiet = FALSE) {
  reformat_hierarchy(x, db = db, quiet = quiet, ranks = ranks)
}


# Reformat DADA2 Species ----------------------------------------------------
#' Reformat DADA2 Species
#'
#' @param x A DNAbin with names formatted Accession|taxid;taxonomy
#' @param quiet Whether progress should be printed to console
#'
#' @return
#' @export
#' @import stringr
#' @import dplyr
#' @importFrom magrittr set_colnames
#' @importFrom tibble as_tibble
#' @importFrom ape as.DNAbin
#'
reformat_dada2_spp <- function(x, quiet = FALSE) {
  time <- Sys.time() # get time
  # Convert to DNAbin
  if (!methods::is(x, "DNAbin")) {
    x <- ape::as.DNAbin(x)
  }
  # Split current names
  seqnames <- names(x) %>%
    stringr::str_replace(";$", "") %>%
    stringr::str_split_fixed(";", n = Inf) %>%
    tibble::as_tibble() %>%
    dplyr::select(1, tail(names(.), 1)) %>%
    magrittr::set_colnames(c("acc", "species")) %>%
    dplyr::mutate(species = stringr::str_replace(species, pattern="_", replacement=" "))

  names(x) <- paste(seqnames$acc, seqnames$species, sep = " ")
  time <- Sys.time() - time
  if (!quiet) (message(paste0("Reformatted annotations for ", length(x), " sequences in ", format(time, digits = 2))))
  return(x)
}


# train IDTAXA -----------------------------------------------------------

#' Train IDTAXA
#'
#' @param x A DNAbin
#' @param max_group_size The maximum size of any taxonomic group. This can be set to Inf (infinity) to allow for an unlimited number of sequences per group.
#' @param max_iterations The number of iterations to conduct training for in order to identify  any training sequences whose assigned classifications completely disagree with their predicted classification.
#' @param allow_group_removal  Whether sequences that are the last remaining representatives of an entire group in the training data can be removed.
#' This can occur if the entire group appears to be misplaced in the taxonomic tree.
#' @param orient Training sequences must all be in the same orientation. Set this to TRUE to reorient the sequences if you are unsure.
#' @param get_lineage Get full taxonomic lineage using reformat_hierarchy if not already present.
#' @param db A database file generated by `get_ncbi_taxonomy` or `get_ott_lineage`. Required if get_lineage is TRUE.
#' @param quiet Whether progress should be printed to console.
#'
#' @return
#' @export
#' @import stringr
#' @importFrom DECIPHER RemoveGaps
#' @importFrom DECIPHER OrientNucleotides
#' @importFrom DECIPHER LearnTaxa
#'
train_idtaxa <- function(x, max_group_size=10, max_iterations = 3,  allow_group_removal = TRUE, orient=FALSE,
                          get_lineage=FALSE, db = NULL, quiet = FALSE) {
  time <- Sys.time() # get time

  #Reformat to complete taxonomic hierarchy
  if(get_lineage & !is.null(db)){
    seqs <- taxreturn::reformat_hierarchy(x, db=db, quiet=FALSE)
  } else if(get_lineage &  is.null(db)){
    stop("If get_lineage is TRUE, a db needs to be provided")
  } else  (seqs <- x)

  #Remove NA's
  if(length(names(seqs)[stringr::str_detect(names(seqs), ";NA;")]) > 1){
    remove <- names(seqs)[stringr::str_detect(names(seqs), ";NA;")]
    subset <- seqs[!names(seqs) %in% remove]
    message(paste0(length(seqs) - length(subset)," Sequences with NA's in taxonomy removed"))
    seqs <- subset
  }

  # Convert to DNAstringset for DECIPHER
  if(methods::is(seqs, "DNAbin")){
    seqs <- taxreturn::DNAbin2DNAstringset(seqs)
  }

  # Remove gaps
  if(!quiet){ message("Removing gaps")}
  seqs <- DECIPHER::RemoveGaps(seqs)

  if(orient){
    if(!quiet){ message("Orienting sequences")}
    seqs <- DECIPHER::OrientNucleotides(seqs)
  }

  # As taxonomies are encoded in the sequence names rather than a separate file, use:
  taxid <- NULL

  #Add Root Rank
  names(seqs) <- names(seqs)  %>% stringr::str_replace(";[;]*", ";Root;")

  # obtain the taxonomic assignments
  groups <- names(seqs) # sequence names

  # assume the taxonomy begins with 'Root;'
  groups <- gsub("(.*)(Root;)", "\\2", groups) # extract the group label
  groupCounts <- table(groups)
  u_groups <- names(groupCounts) # unique groups

  if (!quiet) {message(paste0(length(u_groups), " unique species"))}

  # Pruning training set
  remove <- logical(length(seqs))
  for (i in which(groupCounts > max_group_size)) {
    index <- which(groups==u_groups[i])
    keep <- sample(length(index),
                   max_group_size)
    remove[index[-keep]] <- TRUE
  }
  if (!quiet) {message(paste0(sum(remove), " sequences pruned from over-represented groups"))}

  # Training the classifier
  probSeqsPrev <- integer() # suspected problem sequences from prior iteration

  train <- seqs[!remove]
  taxonomy <- gsub("(.*)(Root;)", "\\2", names(train))
  remove <- logical(length(train))

  for (i in seq_len(max_iterations)) {
    if (!quiet) {message("Training iteration: ", i, "\n", sep="")}
    # train the classifier
    trainingSet <- DECIPHER::LearnTaxa(train[!remove], taxonomy[!remove], taxid)

    # look for problem sequences
    probSeqs <- trainingSet$problemSequences$Index
    if (length(probSeqs)==0) {
      if (!quiet) {message("No problem sequences remaining.\n")}
      break
    } else if (length(probSeqs)==length(probSeqsPrev) &&
               all(probSeqsPrev==probSeqs)) {
      if (!quiet) {message("Iterations converged.\n")}
      break
    }
    if (i==max_iterations)
      break
    probSeqsPrev <- probSeqs

    # remove any problem sequences
    index <- which(!remove)[probSeqs]
    remove[index] <- TRUE # remove all problem sequences
    if (!allow_group_removal) {
      # replace any removed groups
      missing <- !(u_groups %in% groups[!remove])
      missing <- u_groups[missing]
      if (length(missing) > 0) {
        index <- index[groups[index] %in% missing]
        remove[index] <- FALSE # don't remove
      }
    }
  }
  if (!quiet) {message(paste0(sum(remove), " sequences removed"))}
  if (!quiet) {message(paste0(length(probSeqs), " problem sequences remaining"))}

  time <- Sys.time() - time
  if (!quiet) (message(paste0("trained IDTAXA on ", length(x), " sequences in ", format(time, digits = 2))))
  return(trainingSet)
}


# taxonomy_to_newick ------------------------------------------------------
#' tax2tree
#'
#' @param x a DNAbin object or an object coercible to DNAbin
#' @param ranks The taxonomic ranks currently assigned to the names
#' @param depth The depth within the tree to return. Default NULL to return lowest input rank
#' @param summarise Select a taxonomic rank to summarise below. Default is FALSE to return a tree of the same size as input
#' @param output The output to return, options are:
#' "phylo" to return an ape phylo object. If summarise is set to a rank, the number of distinct taxa at that rank numbers are appended to the tip labels if append_sum is TRUE, or returned as an attribute and can be accessed with attr(tree, "sum")
#' "data.tree" to return a data.tree node object
#' "treedf" to return a simplified tree with taxon summaries in a data frame
#' "newick" to return a newick file for visualisation in other software. If summarise is set to a rank, the number of distinct taxa at that rank numbers are returned as an attribute and can be accessed with attr(tree, "sum")
#' @param replace_bads An option to automatically replace invalid newick characters with '-'
#' @param append_sum An option to append the summary number to the output trees tip labels when summarise is a rank and output is 'phylo'.
#' If FALSE, summary numbers are returned as an attribute and can be accessed with attr(tree, "sum")
#'
#' @return a phylo, data.tree, newick, or treedf object
#' @import stringr
#' @import dplyr
#' @importFrom tidyr unite
#' @importFrom phytools read.newick
#' @importFrom magrittr set_colnames
#' @importFrom data.tree as.Node
#' @importFrom data.tree ToNewick
#' @importFrom data.tree ToDataFrameTree
#' @importFrom ape as.DNAbin
#' @importFrom ape base.freq
#' @importFrom methods is
#' @export
#'
#'
tax2tree <- function(x, ranks = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
                     depth=NULL, summarise = FALSE,  output="phylo", replace_bads=FALSE, append_sum = TRUE){

  # start timer
  time <- Sys.time()

  #Checks
  if (!methods::is(x, "DNAbin")) {
    x <- ape::as.DNAbin(x)
    if (all(is.na(ape::base.freq(x)))) {stop("Error: Object is not coercible to DNAbin \n")}
  }
  if (!output %in% c("phylo", "treedf", "data.tree", "newick")) { stop("Output must be 'phylo', 'data.tree', 'newick' or 'treedf'")}

  if(any(stringr::str_detect(names(x) %>% stringr::str_remove("\\|.*$"), "\\,|\\;|\\:|\\(|\\)|\\[|\\]")) && output %in% c("newick", "phylo")){
    if(replace_bads){
      names(x) <- names(x) %>%
        stringr::str_split_fixed("\\|", n=2) %>%
        as.data.frame() %>%
        dplyr::mutate(
          V1 = V1 %>%
            stringr::str_remove("\\|.*$") %>%
            stringr::str_replace_all("\\,|\\;|\\:|\\(|\\)|\\[|\\]", "_")
        ) %>%
        tidyr::unite("names", everything(), sep="|") %>%
        dplyr::pull(names)

    }else{
      stop("Sequence accessions contain one or more of the invalid characters , ; : ( ) [ ] charcters, set replace_bads to TRUE to replace with '-' ")
    }
  }

  # leave an autodetect for ranks?
  ranks <- stringr::str_to_lower(ranks)

  if (!is.null(depth)){
    ranks <- ranks[1:depth]
  }

  if(methods::is(summarise, "character")){
    summarise <- stringr::str_to_lower(summarise)

    if(summarise %in% ranks){
      groupranks <- ranks[1:match(summarise, ranks)]
    } else {
      stop("Summarise must be one of the values in ranks")
    }
  }
  #Get taxonomic lineage
  lineage <- names(x) %>%
    stringr::str_remove(pattern=";$") %>%
    stringr::str_split_fixed(";", n=Inf) %>%
    as.data.frame() %>%
    magrittr::set_colnames(c("Acc", ranks ))

  lineage <- lineage[,!is.na(colnames(lineage))]

  if(methods::is(summarise, "character")){
    lineage <- lineage %>%
      dplyr::select(-Acc) %>%
      dplyr::group_by_at(groupranks) %>%
      dplyr::summarise(sum = dplyr::n())

    tipsums <- lineage %>%
      dplyr::pull(sum)
    names(tipsums) <- lineage%>%
      dplyr::pull(!!summarise)

    lineage <- lineage %>%
      tidyr::unite(col=pathString, !!groupranks, sep="/") %>%
      dplyr::mutate(pathString = paste0("Root/", pathString))%>%
      data.tree::as.Node(.)

  } else if(isFALSE(summarise)){
    lineage <- lineage %>%
      mutate(Acc = Acc %>% str_remove("\\|.*$")) %>%
      tidyr::unite(col=pathString, !!ranks, Acc, sep="/") %>%
      dplyr::mutate(pathString = paste0("Root/", pathString)) %>%
      data.tree::as.Node(.)
  } else(
    stop("Summarise must refer to a taxonomic rank, or be FALSE")
  )

  if (output=="phylo"){
    nwk <- lineage  %>%
      data.tree::ToNewick(heightAttribute = NULL)
    out <-  phytools::read.newick(textConnection(nwk))
    if(methods::is(summarise, "character") & append_sum){
      out$tip.label <- paste0(out$tip.label, "-", tipsums)
      attr(out, "sum") <- tipsums
    }else if(methods::is(summarise, "character") & !append_sum){
      attr(out, "sum") <- tipsums
    }
  } else if (output=="treedf" & methods::is(summarise, "character")){
    out <- data.tree::ToDataFrameTree(lineage, "sum")
  } else if (output=="treedf" & !methods::is(summarise, "character")){
    out <- data.tree::ToDataFrameTree(lineage)
  } else if (output=="data.tree"){
    out <- lineage
  } else if (output =="newick"){
    out <- lineage  %>%
      data.tree::ToNewick(heightAttribute = NULL)
    if(methods::is(summarise, "character")){
      attr(out, "sum") <- tipsums
    }
  }

  time <- Sys.time() - time
  message(paste0("Generated a taxonomic tree for ", length(x), " Sequences in ", format(time, digits = 2)))

  return(out)
}


# tax2phylo ---------------------------------------------------------------
#' tax2phylo
#'
#' @description A newer, faster tax2tree that grafts tips. Only returns a phylo objec
#' @param x a DNAbin object or an object coercible to DNAbin
#' @param ranks The taxonomic ranks currently assigned to the names
#' @param depth The depth within the ranks to return. Default all to return lowest input rank
#' @param summarise Summarise the number of sequences at depth.
#' @param append_sum Append the summary number to the output trees tip labels when summarise is TRUE
#' @param hex Convert the accessions to hexadecimal format to avoid newick incompatible characters. Can be reversed with hex2acc
#' @param resolve_poly Randomly resolve polytomies in tree using `ape::multi2di`. use 'all' to resolve all polytomies, or 'upper' to only resolve upper branches, useful for making constraints trees
#'
#' @return
#' @export
#' @import stringr
#' @import dplyr
#' @importFrom tidyr unite
#' @importFrom magrittr set_colnames
#' @importFrom phytools read.newick
#' @importFrom data.tree as.Node
#' @importFrom data.tree ToNewick
#' @importFrom data.tree ToDataFrameTree
#' @importFrom ape as.DNAbin
#' @importFrom ape base.freq
#' @importFrom ape multi2di
#' @importFrom methods is
#'
tax2phylo <- function(x, ranks = c("kingdom", "phylum", "class", "order", "family", "genus", "species"), depth="all",
                      summarise = FALSE, append_sum = TRUE, hex=FALSE, resolve_poly=FALSE){

  # start timer
  time <- Sys.time()

  #Checks
  if (!methods::is(x, "DNAbin")) {
    x <- ape::as.DNAbin(x)
    if (all(is.na(ape::base.freq(x)))) {stop("Error: Object is not coercible to DNAbin \n")}
  }

  if(isTRUE(hex)){
    x <- acc2hex(x)
  }

  if(any(stringr::str_detect(names(x) %>% stringr::str_remove("\\|.*$"), "\\,|\\;|\\:|\\(|\\)|\\[|\\]"))) {
    stop("Sequence accessions contain one or more of the invalid characters , ; : ( ) [ ] charcters, set hex to TRUE to convert to hexadecimal encoding ")
  }

  if(is.character(resolve_poly) & !resolve_poly %in% c("upper", "all")){
    stop("resolve_poly must either be FALSE, 'upper', or 'all ")
  }

  # leave an autodetect for ranks?
  ranks <- stringr::str_to_lower(ranks)
  depth <- stringr::str_to_lower(depth)

  if (depth %in% ranks){
    groupranks <- ranks[1:match(depth, ranks)]
    message("Ranks included at depth ", depth," : ", paste(ranks, collapse=" "))
  } else if(depth==all){
    groupranks <- ranks
  } else {
    stop("Depth must be 'all' or one of the values in ranks")
  }

  #Get taxonomic lineage
  lineage <- names(x) %>%
    stringr::str_remove(pattern=";$") %>%
    stringr::str_split_fixed(";", n=Inf) %>%
    as.data.frame() %>%
    magrittr::set_colnames(c("Acc", ranks ))

  #lineage <- lineage[,!is.na(colnames(lineage))]

  if(isTRUE(summarise)){
    lineage <- lineage %>%
      dplyr::select(-Acc) %>%
      dplyr::group_by_at(groupranks) %>%
      dplyr::summarise(sum = dplyr::n())

    tipsums <- lineage %>%
      dplyr::pull(sum)
    names(tipsums) <- lineage%>%
      dplyr::pull(!!depth)

    tree <- lineage %>%
      tidyr::unite(col=pathString, !!groupranks, sep="/") %>%
      dplyr::mutate(pathString = paste0("Root/", pathString))%>%
      data.tree::as.Node(.) %>%
      data.tree::ToNewick(heightAttribute = NULL) %>%
      textConnection() %>%
      phytools::read.newick()

    #Append summaries of lower ranks
    if(append_sum){
      tree$tip.label <- paste0(tree$tip.label, "-", tipsums)
      attr(tree, "sum") <- tipsums
    }else if(!append_sum){
      attr(tree, "sum") <- tipsums
    }

    #Resolve polytomies in tree
    if(resolve_poly == "upper"){
      tree <- ape::multi2di(tree)
    }

  } else if(isFALSE(summarise)){
    # get mapping table for grafting
    mapping <- lineage %>%
      dplyr::mutate(Acc = Acc %>% str_remove("\\|.*$")) %>%
      dplyr::select(tail(groupranks,1), Acc)%>%
      mutate_all(~str_replace_all(.x," ", "_"))

    upper_tree <- lineage %>%
      dplyr::select(!!groupranks) %>%
      dplyr::distinct() %>%
      tidyr::unite(col=pathString, !!groupranks, sep="/") %>%
      dplyr::mutate(pathString = paste0("Root/", pathString)) %>%
      data.tree::as.Node(.,
                         table,
                         pathName = "pathString",
                         pathDelimiter = "/",
                         colLevels = NULL,
                         na.rm = TRUE,
                         check = "check")  %>%
      data.tree::ToNewick(heightAttribute = NULL) %>%
      textConnection() %>%
      phytools::read.newick()

    #Resolve polytomies in upper tree
    if(resolve_poly == "upper"){
      upper_tree <- ape::multi2di(upper_tree)
    }

    # Graft tips onto upper tree
    tree <- graft_tips(upper_tree, mapping)
  }else(
    stop("Summarise must be TRUE or FALSE")
  )
  #Resolve polytomies in tree
  if(resolve_poly == "all"){
    tree <- ape::multi2di(tree)
  }
  time <- Sys.time() - time
  message(paste0("Generated a taxonomic tree for ", length(x), " Sequences in ", format(time, digits = 2)))
  return(tree)
}


# graft_tips --------------------------------------------------------------
#' Graft new tips onto an existing tree
#'
#' @param tree A phylo object
#' @param mapping A mapping data frame containing two columns.
#' The left column must be identical to the tip labels of the phylogeny, the right column contains new tips to be grafted.
#' @param branch_length A numeric giving the branch lengths for the new polytomic tips that are added.
#'
#' @return
#' @export
#' @importFrom ape read.tree
#' @importFrom ape compute.brlen
#' @importFrom ape bind.tree
#'
#'
graft_tips <- function(tree, mapping, branch_length = 0){
  if (!inherits(tree, "phylo")) stop("'tree' is not of class 'phylo'")

  info <- apply(mapping, 2, setequal, y = tree$tip.label)
  if (!any(info)) stop("'mapping' is not congruent with tiplabels of 'tree'")

  #Split into current tips to be grafted
  mapping <- split(mapping[, !info], mapping[, info])
  add_tip <- function(z){
    z <- ape::read.tree(text = paste0("(", paste(z, collapse = ","), ");"))
    ape::compute.brlen(z, branch_length) ## set branch lengths to branch_length
  }
  combs <- lapply(mapping, add_tip)
  for (i in seq_along(combs)){
    tree <- ape::bind.tree(tree, combs[[i]], which(tree$tip.label == names(combs)[i]))
  }
  return(tree)
}


# Filter sequences by taxonomy --------------------------------------------
#' Filter sequences by taxonomy
#'
#' @param x A DNAbin or DNAString with heirarchial taxonomy
#' @param filt_rank The taxonomic rank to subset at i.e. 'Class'
#' @param filt_value A taxonomic name or vector of taxonomic names to subset filt_rank at at i.e. c('Insecta', 'Arachnida')
#' @param ranks The taxonomic ranks currently assigned to the sequences
#'
#' @return
#' @export
#' @import stringr
#' @importFrom tidyr unite
#' @import dplyr
#' @importFrom magrittr set_colnames
#'
#'
filter_by_tax <- function(x, filt_rank, filt_value, ranks=c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")){

  # Convert to DNAbin
  if (!class(x) %in% c("DNAbin", "DNAStringSet")  ) {
    stop("x must be a DNAbin or DNAstringset \n")
  }

  ranklength <- length(ranks)
  if(!stringr::str_count(names(x)[[1]] %>% stringr::str_remove(";$"), ";")== ranklength){
    stop("Error, the number of semicolon delimiters do not match the length of ranks")
  }

  filtnames <- names(x) %>%
    stringr::str_remove(";$") %>%
    stringr::str_split_fixed(";", n=(ranklength+1)) %>%
    as.data.frame(stringsAsFactors=FALSE) %>%
    magrittr::set_colnames(c("acc", tolower(ranks))) %>%
    dplyr::filter(!!sym(tolower(filt_rank)) %in% filt_value)%>%
    tidyr::unite(col="names", dplyr::everything(), sep=";") %>%
    pull(names)

  out <- x[names(x) %in% filtnames]

  if(length(filt_value) > 10){
    out_message <- paste0(paste(filt_value[1:10], collapse= ", "),"...[TRUNCATED]")
  } else(
    out_message <- paste(filt_value, collapse= ", ")
  )

  message(length(x) - length(out), " sequences which did not pass the filter '", filt_rank, " %in% ", out_message, "' removed")
  return(out)
}

# LCA probs ---------------------------------------------------------------

#' Probabilitys of sharing a rank as a function of sequence identity
#'
#' @param x a DNAbin object or an object coercible to DNAbin
#' @param method The distance matrix computation method to use,
#' accepts "mbed" which computes a distance matrix from each sequence to a subset of 'seed' sequences using the method outlined in Blacksheilds et al (2010).
#' This scales well to big datasets, alternatively "kdist" computes the full n * n distance matrix.
#' @param k integer giving the k-mer size used to generate the input matrix for k-means clustering.
#' @param nstart value passed to  nstart passed to kmeans. Higher increases computation time but can improve clustering accuracy considerably.
#' @param ranks The taxonomic ranks currently assigned to the names
#' @param delim The delimiter used between ranks
#'
#' @return
#' @export
#' @import stringr
#' @import dplyr
#' @importFrom kmer kdistance
#' @importFrom kmer mbed
#' @importFrom tidyr separate
#' @importFrom ape as.DNAbin
#' @importFrom utils combn
#'
lca_probs <- function(x, method="mbed",  k=5, nstart = 20, ranks=c("kingdom", "phylum", "class", "order", "family", "genus", "species"), delim=";"){
  # Convert to DNAbin
  if (!methods::is(x, "DNAbin")) {
    x <- ape::as.DNAbin(x)
  }

  # Replace any trailing delimiters
  names(x) <- names(x) %>%
    stringr::str_remove(";$")
  # Count number of remaining delimiters
  ndelim <- stringr::str_count(names(x)[1], ";")

  #check if delims match ranks
  if(any(ndelim <=1)) {
    stop("Error: Needs to be in hierarchial format first, run get_ncbi_lineage or get_ott_lineage")
  } else if (!ndelim==length(ranks)){
    stop("Error: Number of delimiters does not match number of ranks")
  }

  # Create distance matrix - could probably do this using random samples
  if(method == "kdist"){
    dist <- as.matrix(kmer::kdistance(x, k=k, method="edgar", residues="DNA"))
  } else if(method == "mbed"){
    dist <- as.matrix(kmer::mbed(x, k=k, residues="DNA")[,])
  }

  #convert to pairwise distances
  xy <- t(utils::combn(colnames(dist), 2))
  pw <- data.frame(xy, dist=(100 - round(dist[xy] * 100)))

  # subset to the values in loop

  sim <- sort(unique(pw$dist), decreasing = TRUE)
  simlist <- vector("list", length=length(sim))
  s=1
  for (s in 1:length(sim)) {

    subsets <- pw %>%
      filter(dist==sim[s])

    df1 <- subsets %>%
      tidyr::separate(X1, into=c("Acc",ranks), sep=";") %>%
      dplyr::select(rev(ranks))

    df2 <- subsets %>%
      tidyr::separate(X2, into=c("Acc",ranks), sep=";") %>%
      dplyr::select(rev(ranks))

    #Get all shared ranks
    logidf <- as.data.frame(df1 == df2)

    #Get lowest common rank
    keepvec <- apply(logidf, 1, which.max)
    rows <- seq(logidf[,1])
    #cols <- seq(logidf[1,])
    #unselect <- matrix(ncol=2,c(rep(rows, length(cols)), sort(rep(cols, length(rows)))))
    select <- matrix(ncol=2,c(rows, keepvec))

    logidf[select] <- "KEEP"
    logidf[!logidf=="KEEP"] <- 0
    logidf[logidf=="KEEP"] <- 1
    logidf <- logidf  %>%
      mutate_all(as.numeric) %>%
      colSums() / length(rows)

    simlist[[s]]  <- data.frame(rank=names(logidf), prob=logidf, sim=sim[s])

  }
  names(simlist) <- sim
  out <- dplyr::bind_rows(simlist) %>%
    dplyr::group_by(rank, sim) %>%
    dplyr::summarise(prob = mean(prob))
  return(out)
}


# Accessions from fasta ---------------------------------------------------
#' Genbank accessions from fasta
#'
#' @param x A filepath or vector of filepaths to fasta files
#'
#' @return
#' @export
#' @import dplyr
#' @importFrom Biostrings fasta.index
#'
acc_from_fasta <- function(x) {
  if(!all(file.exists(x))){
    stop("Input must be a filepath or vector of filepaths to fasta files")
  }
  out <- Biostrings::fasta.index(x) %>%
    dplyr::mutate(acc = desc %>%
                    stringr::str_remove(pattern="\\|.*$")) %>%
    dplyr::pull(acc)

  return(out)
}
alexpiper/taxreturn documentation built on Sept. 14, 2024, 7:56 p.m.