# File: R/summarizebylevel.R

#' Sum the sequences that belong to the same taxon.
#'
#' Each taxonomy string is split on ";" into seven ranks. Ranks left empty are
#' labelled "unclassified_<last assigned rank>" (or plain "unclassified" when
#' no rank at all is assigned), and the counts of all sequences sharing the
#' same label at the requested level are summed per sample.
#'
#' @param count.table The count table containing the quantification for each sequence.
#' @param taxonomy The taxonomic assignment produced by TAG.ME. It must contain a \code{seqID} column (matching the sequence names of the count table) and a \code{Taxonomy} column with the ";"-separated ranks.
#' @param level The taxon level to be summarized. Values are: "Domain", "Phylum", "Class", "Order", "Family", "Genus", or "Specie". Default value = "Genus".
#' @param taxa.are.rows Boolean value describing your count table organization. TRUE value means that your table contains your sequences (ASVs, OTUs) as rownames, and your samples are colnames - Ex: OTU tables generated by Qiime. FALSE value means that your sequences are the colnames and samples are the rownames - Ex: Seqtable produced by Dada2. Default is FALSE.
#'
#' @return A data frame with the unique taxa as rownames, one column per sample, and the summarized count for each taxon. Only sequences present in both the count table and the taxonomy are counted.
#'
#' @examples
#' counts <- data.frame(s1 = c(1, 2), s2 = c(3, 4), row.names = c("seq1", "seq2"))
#' taxonomy <- data.frame(
#'   seqID = c("seq1", "seq2"),
#'   Taxonomy = c("k__Bacteria;p__Firmicutes", "k__Bacteria;p__Firmicutes")
#' )
#' summarize.by.level(counts, taxonomy, level = "Phylum", taxa.are.rows = TRUE)
#'
#' @export
summarize.by.level <- function(count.table, taxonomy, level = "Genus", taxa.are.rows = FALSE) {

  rank_names <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Specie")

  # Fail loudly if the dependency is missing, without attaching it to the
  # caller's search path (which require() would do).
  if (!requireNamespace("stringr", quietly = TRUE)) {
    stop("Package 'stringr' is required by summarize.by.level().", call. = FALSE)
  }
  # A numeric column index is still accepted; only rank names are validated.
  if (length(level) != 1L || (is.character(level) && !level %in% rank_names)) {
    stop("'level' must be one of: ", paste(rank_names, collapse = ", "), call. = FALSE)
  }
  if (is.null(taxonomy$seqID) || is.null(taxonomy$Taxonomy)) {
    stop("'taxonomy' must contain the columns 'seqID' and 'Taxonomy'.", call. = FALSE)
  }

  # Orient the table so that sequences are rows and samples are columns.
  # data.frame() is applied in both branches, so sample names that are not
  # syntactically valid R names are adjusted in the same way either way.
  count.table <- as.data.frame(count.table)
  if (isTRUE(taxa.are.rows)) {
    count.table <- data.frame(count.table)
  } else {
    count.table <- data.frame(t(count.table))
  }

  # One row per sequence, one column per rank; missing trailing ranks come
  # back as "" and the rank prefixes (e.g. "g__") are stripped.
  ranks <- stringr::str_split_fixed(as.character(taxonomy$Taxonomy),
                                    pattern = ";", n = length(rank_names))
  ranks <- gsub(".__", "", ranks)
  ranks[is.na(ranks)] <- ""

  # Walk the ranks from Domain to Specie, tracking for every sequence the
  # most recent assigned name. The tracker starts as NA for each sequence, so
  # an unassigned Domain can never inherit a name from another sequence.
  last_assigned <- rep(NA_character_, nrow(ranks))
  for (j in seq_len(ncol(ranks))) {
    assigned <- ranks[, j] != ""
    last_assigned[assigned] <- ranks[assigned, j]
    fallback <- ifelse(is.na(last_assigned),
                       "unclassified",
                       paste("unclassified", last_assigned, sep = "_"))
    ranks[!assigned, j] <- fallback[!assigned]
  }
  colnames(ranks) <- rank_names

  # Row names are set explicitly: subsetting a one-row matrix drops the names,
  # which would make the merge on row names silently match nothing.
  taxon_by_seq <- data.frame(Taxon = unname(ranks[, level]),
                             row.names = as.character(taxonomy$seqID),
                             stringsAsFactors = FALSE)

  merged <- merge(count.table, taxon_by_seq, by = "row.names")
  if (nrow(merged) == 0L) {
    stop("No sequence of 'count.table' was found in 'taxonomy$seqID'; ",
         "check the 'taxa.are.rows' argument.", call. = FALSE)
  }

  # Drop the "Row.names" key column added by merge(); Taxon is the last column.
  # drop = FALSE keeps a data frame even when there is a single sample.
  merged <- merged[, -1, drop = FALSE]
  taxon_col <- ncol(merged)
  table_aggregate <- aggregate(merged[, -taxon_col, drop = FALSE],
                               by = list(Taxon = as.factor(merged[[taxon_col]])),
                               FUN = sum)
  row.names(table_aggregate) <- table_aggregate[[1]]
  table_aggregate <- table_aggregate[, -1, drop = FALSE]

  table_aggregate

}
# gabrielrfernandes/tagme documentation built on Dec. 25, 2021, 1:25 a.m.