# Author: Walter Xie
# Accessed on 10 Mar 2016
# hasGroup, specify if values in the last column are groups, it affects how to process matrix
# hasGroup=TRUE, return a data frame by removing last column (groups),
# and another 1-column data frame for the last column (groups).
# return a list, where data frame community.matrix, cols are samples, rows are OTUs/taxa, no preprocessing
# data frame groups may be NULL depending on hasGroup
#' @name IOComMA
#' @title IO functions in \pkg{ComMA}
#'
#' @description IO functions for the data defined in \pkg{ComMA} package, such as community matrix.
#'
#' @details
#' \code{readCommunityMatrix} reads file to return a community matrix.
#'
#' @param file The file to read/write.
#' @param matrix.name The string to locate the matrix from its file name.
#' @param minAbund The minimum abundance threshold to remove rows/columns
#' by row/column sum of abundance. For exampe, if minAbund=2, then remove
#' all singletons appeared in only one sample. If minAbund=1,
#' then remove all empty rows/columns. Default to 2 (singletons).
#' But \code{postfix} is only used for naming, no data processed.
#' @param regex1,regex2 Use for \code{\link{gsub}(regex1, regex2, row.names)}
#' to remove or replace annotation from original labels.
#' Default to \code{regex1="(\\|[0-9]+)", regex2=""} for read??? but NULL to write???,
#' which removes size annotation seperated by "|".
#' @param ignore.case Refer to \code{\link{gsub}}.
#' @param col.name.decreasing Should the sort decreasing order of \code{colnames}
#' be TRUE? Refer to \code{\link{order}}. If NULL, do nothing.
#' Default to FALSE.
#' @keywords IO
#' @export
#' @examples
#' cm <- readCommunityMatrix("16S.txt", "16S")
#'
#' @rdname IOComMA
readCommunityMatrix <- function(file, matrix.name=NULL, minAbund=2,
regex1="(\\|[0-9]+)", regex2="", ignore.case=TRUE,
col.name.decreasing=FALSE, verbose=TRUE) {
community.matrix <- ComMA::readFile(file, verbose=verbose, msg.file=paste(matrix.name, "community matrix"),
msg.col="samples", msg.row="OTUs")
community.matrix <- ComMA::rmMinAbundance(community.matrix, minAbund)
# remove/replace annotation
if (! is.null(regex1))
rownames(community.matrix) <- gsub(regex1, regex2, rownames(community.matrix), ignore.case = ignore.case)
if (! is.null(col.name.decreasing)) {
community.matrix <- community.matrix[,order(colnames(community.matrix), decreasing=col.name.decreasing)]
}
attr(community.matrix,"name") <- matrix.name
return(community.matrix)
}
#' @details
#' \code{writeCommunityMatrix} writes a community matrix to file.
#'
#' @keywords IO
#' @export
#' @examples
#' writeCommunityMatrix(cm, "16S-new.txt", msg.file="16S")
#'
#' @rdname IOComMA
writeCommunityMatrix <- function(community.matrix, file,
msg.file="file", msg.col="columns", msg.row="rows",
regex1=NULL, regex2="", ignore.case=TRUE) {
# rm empty rows and columns
community.matrix <- ComMA::rmMinAbundance(community.matrix, minAbund=1)
community.matrix <- ComMA::rmMinAbundance(community.matrix, minAbund=1, MARGIN=2)
# remove/replace annotation
if (! is.null(regex1))
rownames(community.matrix) <- gsub(regex1, regex2, rownames(community.matrix),
ignore.case = ignore.case)
ComMA::writeTable(community.matrix, file, msg.file=msg.file, msg.col=msg.col, msg.row=msg.row)
}
#' @details
#' \code{readTaxaTable} reads a file to return a taxa table.
#' \code{\link{subsetTaxaTable}} can be used to takes or excludes
#' a subset of the taxa table recursively.
#'
#' @param taxa.group The taxonomic group, the values can be 'all', 'assigned', or
#' Group 'all' includes everything.
#' Group 'assigned' removes all uncertain classifications including
#' 'root', 'cellular organisms', 'No hits', 'Not assigned'.
#' Alternatively, any high-ranking taxonomy in your taxonomy file
#' can be used as a group or multi-groups (seperated by "|"),
#' such as 'BACTERIA', 'Proteobacteria', etc. But they have to be
#' in the same rank column in the file. Default to remove all
#' uncertain classifications, even when group(s) assigned.
#' @param rank The rank column in the file to specify where to
#' search for \code{taxa.group}.
#' @param include Define whether include or exclude given \code{taxa.group}.
#' Default to TRUE.
#' @keywords IO
#' @export
#' @examples
#' tt.megan <- readTaxaTable("16s-megan.txt", "16S taxa table", taxa.group="Bacteria")
#' tt.rdp.8 <- readTaxaTable("16s-rdp.txt", "16S taxa table", taxa.group="Bacteria")
#'
#' @rdname IOComMA
readTaxaTable <- function(file, matrix.name=NULL, taxa.group="assigned", rank="kingdom",
regex1="(\\|[0-9]+)", regex2="", ignore.case=TRUE,
include=TRUE, sep="\t", verbose=TRUE) {
taxa.table <- ComMA::readFile(file, sep=sep, verbose=verbose,
msg.file=paste(matrix.name, "taxonomy table"), msg.row="OTUs")
taxa.table <- taxa.table[order(rownames(taxa.table)),]
# make lower case to match ranks
colnames(taxa.table) <- tolower(colnames(taxa.table))
if (! is.element(rank, colnames(taxa.table)) )
stop("Cannot find rank column ", rank, "! Use createTaxaTable??? function to create taxa.table file.")
# remove/replace annotation
if (! is.null(regex1))
rownames(taxa.table) <- gsub(regex1, regex2, rownames(taxa.table), ignore.case = ignore.case)
n.taxa=nrow(taxa.table)
##### keep OTU rows contain given taxa belongTo #####
if (taxa.group != "all") {
# Exclude unassigned etc
taxa.table <- subset(taxa.table, !(grepl("root|cellular organisms|No hits|Not assigned", taxa.table[,"kingdom"], ignore.case = T)))
# for PROTISTS, taxa.group="CHROMISTA|PROTOZOA"
if (toupper(taxa.group) == "PROTISTS")
taxa.group="CHROMISTA|PROTOZOA"
if (taxa.group != "assigned") {
taxa.table <- ComMA::subsetTaxaTable(taxa.table, taxa.group=taxa.group, rank=rank, include=include)
}
}
if (nrow(taxa.table) < 1)
warning("Cannot find ", taxa.group, " at ", rank, " from taxa path file ", file, " !")
if(verbose && nrow(taxa.table) < n.taxa) {
cat("Select", nrow(taxa.table), "classifications, by given taxa.group =", taxa.group,
", rank =", rank, ", include =", include, ".\n")
}
return(taxa.table)
}
# "superkingdom", "kingdom", "phylum", "class", "order", "family", "genus"
# "kingdom" is compulsory, "path" is optional
# c("16s-path.txt", "16s-kingdom.txt", "16s-phylum.txt", "16s-class.txt", "16s-order.txt", "16s-family.txt")
#' @details
#' \code{createTaxaTableMEGAN} reads a list of taxa mapping files exported
#' from MEGAN to create a taxa table in \code{folder.path}.
#'
#' @param file.prefix The prefix to find taxa mapping files,
#' such as "16s" for "16s-path.txt", "16s-kingdom.txt", "16s-phylum.txt".
#' @param folder.path The folder path to contain taxa mapping files
#' and output MEGAN taxa table.
#' @param col.names A vector of column names of taxonomic ranks in the taxa table,
#' and they also determine the files to be merged into columns. Default to
#' c("path", "kingdom", "phylum", "class", "order", "family", "genus"), which indicates
#' the list of files c("16s-path.txt", "16s-kingdom.txt", "16s-phylum.txt", "16s-class.txt",
#' "16s-order.txt", "16s-family.txt", "16s-genus.txt") given file.prefix = "16s". "path" is optional.
#' @param file.out The taxonomic table file name.
#' @keywords IO
#' @export
#' @examples
#' createTaxaTableMEGAN(getwd(), "16s", file.out="16s-megan.txt")
#'
#' @rdname IOComMA
createTaxaTableMEGAN <- function(folder.path, file.prefix, sep="\t",
regex1="(\\|[0-9]+)", regex2="", ignore.case=TRUE,
col.names=c("path", "kingdom", "phylum", "class", "order", "family", "genus"),
file.out="taxa-table-megan.txt") {
taxa.files <- paste0(file.prefix, "-", col.names, ".txt")
for (n in 1:length(col.names)) {
t.f <- file.path(folder.path, taxa.files[n])
# no header, no row.names in the file
df.taxa <- ComMA::readFile(t.f, sep=sep, header=FALSE, row.names=NULL)
if (ncol(df.taxa) < 2)
stop(paste("Taxa file", t.f, "can be correctly parsed ! Please check the file format."))
# remove " <phylumn>" added from MEGAN
df.taxa[,2] <- gsub("\\s+<.*>", "", df.taxa[,2])
# remove " (miscellaneous)"
df.taxa[,2] <- gsub("\\s+\\(miscellaneous\\)", "", df.taxa[,2])
colnames(df.taxa) <- c("OTUs",col.names[n])
if (n==1)
df.path <- df.taxa
else
df.path <- merge(df.path, df.taxa, by = "OTUs")
}
# remove/replace annotation
if (! is.null(regex1))
df.path[,"OTUs"] <- gsub(regex1, regex2, df.path[,"OTUs"], ignore.case = ignore.case)
taxa.f <- file.path(folder.path, file.out)
ComMA::writeTable(df.path, taxa.f, row.names = FALSE, msg.file = "taxa table")
}
#' @details
#' \code{createTaxaTableRDP} reads a 3-column taxa mapping file generated from RDP
#' to create a taxa table in \code{folder.path}. The row look like:
#' OTU1 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__;g__;s__ 0.970
#'
#' @param rm.rank.prefix Remove rank prefix, such as 'k__'. Default to TRUE.
#' @keywords IO
#' @export
#' @examples
#' createTaxaTableRDP("otus_tax_assignments.txt", file.out="16s-rdp.txt")
#'
#' @rdname IOComMA
createTaxaTableRDP <- function(file, sep="\t", rm.rank.prefix=TRUE,
regex1="(\\|[0-9]+)", regex2="", ignore.case=TRUE,
file.out="taxa-table-rdp.txt") {
df.taxa <- ComMA::readFile(file, sep=sep, header=FALSE, row.names=NULL)
if (ncol(df.taxa) < 3)
stop(cat("RDP output file", file, "can be correctly parsed !\nPlease check the file format."))
colnames(df.taxa) <- c("OTUs","path","confidence")
# rm []
df.taxa[,"path"] <- gsub("\\[|\\]", "", df.taxa[,"path"])
# rm Root; for early version
df.taxa[,"path"] <- gsub("Root;", "", df.taxa[,"path"], ignore.case = T)
if (rm.rank.prefix)
vector.path <- gsub("[a-z]__", "", df.taxa[,"path"])
else
vector.path <- df.taxa[,"path"]
cols.lin <- read.table(text = vector.path, sep = ";", colClasses = "character", fill=TRUE)
if (ncol(cols.lin) < 3)
stop(cat("RDP output file", file, ", 2nd column 'taxa path' needs at least 3 ranks !",
"\nFor example, k__;p__;c__;o__;f__;g__;s__"))
# fix to k__;p__;c__;o__;f__;g__;s__
colnames(cols.lin) <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")[1:ncol(cols.lin)]
df.path <- cbind(df.taxa, cols.lin)
# remove/replace annotation
if (! is.null(regex1))
df.path[,"OTUs"] <- gsub(regex1, regex2, df.path[,"OTUs"], ignore.case = ignore.case)
ComMA::writeTable(df.path, file.out, row.names = FALSE, msg.file = "taxa table")
}
# if input is a list, then set input.list=T to unwrap list(...) and get the original list
# if input is cm1, cm2, ..., then set input.list=F
unwrapInputList <- function(..., input.list=FALSE) {
cm.list <- list(...)
if (!input.list && is(cm.list[[1]], "list")) {
stop("Invaild input: find a list of cm, please set input.list=T !")
}
if (input.list && is(cm.list, "list"))
cm.list <- cm.list[[1]]
# validation
if (! (is(cm.list, "list") && length(cm.list) > 0) )
stop("Invaild input: either not a list, please set input.list=F, or empty input, length = ", length(cm.list), " !")
return(cm.list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.