# Extract OTUs or reads from CM and taxa classification
#
# Author: Walter Xie
# Accessed on 20 Apr 2016
#source("http://bioconductor.org/biocLite.R")
#biocLite("ShortRead")
#browseVignettes("ShortRead")
# extract sequences from given otus file whose names match given rownames(cm)
#' @name utilsSeq
#' @title Utils to manipulate sequences or OTUs
#'
#' @description Utils to manipulate sequences or OTUs,
#' such as extract OTUs given the subset of OTU names.
#' They are depended on Bioconductor packages
#' \url{http://www.bioconductor.org/}.
#'
#' @details
#' \code{subsetSequences} returns the subset of OTUs matching given OTU names.
#' It is depended on \code{\link{ShortRead}} package. Follow the instruction
#' \url{https://bioconductor.org/packages/release/bioc/html/ShortRead.html} to install.
#'
#' @param in.file The fasta file of OTU representive sequences.
#' Read by \code{\link{readFasta}}.
#' @param out.file The output fasta file containing extrated sequences.
#' @param otus.names The vector of names to match sequence labels in \code{in.file}.
#' @param regex1,regex2 Use for \code{\link{gsub}(regex1, regex2, id(fasta))}
#' to remove or replace annotation from original labels.
#' Default to \code{regex1="(\\|[0-9]+)", regex2=""} in \code{subsetSequences},
#' which removes size annotation seperated by "|",
#' but NA in \code{renameFastaID}, which does nothing.
#' @param ignore.case Refer to \code{\link{gsub}}.
#' @param max.seq Give the number (\code{max.seq}) of seleceted sequences,
#' if extracted sequences \code{> max.seq}, then choose \code{max.seq} sequences randomly from them.
#' Defaul to 0 to ignore it.
#' @keywords util
#' @export
#' @examples
#' subsetSequences(in.file, otus.names, out.file)
#'
#' @rdname utilsSeq
subsetSequences <- function(in.file, out.file, otus.names=c(), regex1="(\\|[0-9]+)",
regex2="", ignore.case = TRUE, max.seq=0) {
require(ShortRead)
otus.fasta <- readFasta(in.file)
if (!is.na(regex1)) {
# remove/replace annotation
otus.fasta <- ComMA::renameFastaID(otus.fasta, regex1=regex1, regex2=regex2, ignore.case=ignore.case)
}
fasta.id <- as.character(id(otus.fasta))
# extract seq by otus.names
if (length(otus.names) > 0) {
if (! is(otus.names, "character"))
otus.names <- as.character(otus.names)
final.fasta <- otus.fasta[(fasta.id %in% otus.names),]
cat("Extract", length(final.fasta), "sequences from total", length(otus.fasta),
"by given", length(otus.names), "sequence names.\n")
if (length(final.fasta) != length(otus.names))
warning("The extracted sequences ", length(final.fasta), " != ", length(otus.names), " given otus.names !\n")
} else {
final.fasta <- otus.fasta
}
# random select max.seq sequences
if (max.seq > 1 && length(final.fasta) > max.seq) {
final.id <- sample(1:length(final.fasta), max.seq, replace=F)
final.fasta <- final.fasta[final.id]
cat("Over max limit of sequences ", max.seq, ", so as to randomly extract", length(final.fasta), "sequences from them.\n")
}
writeFasta(final.fasta, out.file)
}
#' @details
#' \code{renameFastaID} renames the sequences id loaded from a fasta file
#' using \code{\link{ShortRead}} package.
#'
#' @param fasta The fasta object returned by \code{\link{readFasta}}.
#' @keywords util
#' @export
#' @examples
#' fasta <- renameFastaID(fasta)
#'
#' @rdname utilsSeq
renameFastaID <- function(fasta, regex1=NA, regex2="", ignore.case=TRUE) {
require(ShortRead)
if (! is(fasta, "ShortRead") )
stop("The input has to be ShortRead object !\n",
"http://bioconductor.org/packages/release/bioc/html/ShortRead.html")
if (! is.na(regex1)) {
fasta.id <- gsub(regex1, regex2, id(fasta), ignore.case = ignore.case)
# change to new id
fasta.id <- BStringSet(fasta.id)
fasta@id <- fasta.id
}
return(fasta)
}
#' @details
#' \code{rmDuplicateSeq} removes duplicate sequences or alginments.
#'
#' @keywords util
#' @export
#' @examples
#' rmDuplicateSeq("alg.fasta", "unique-alg.fasta")
#'
#' @rdname utilsSeq
rmDuplicateSeq <- function(in.file, out.file="unique-alg.fasta") {
require(ShortRead)
otus.alg <- readFasta(in.file)
cat("Find", length(srduplicated(otus.alg)[srduplicated(otus.alg)==TRUE]), "duplicate sequences from", in.file, "\n")
#id(otus.alg)[srduplicated(otus.alg)]
otus.alg.unique <- otus.alg[srduplicated(otus.alg)==FALSE]
writeFasta(otus.alg.unique, out.file)
}
#' @details
#' \code{getTaxaMap} returns a data frame of mapping file to annotate a tree,
#' such as \code{\link{annotateRAXMLTree}}.
#'
#' @param trait.name,trait.value The trait to annotate a tree.
#' Its format in the string will look like '[trait.name=trait.value]'.
#' @keywords util
#' @export
#' @examples
#' taxa.map <- getTaxaMap("16s-otus-Bacteria.fasta", trait.name="taxon", trait.value="Bacteria")
#'
#' @rdname utilsSeq
getTaxaMap <- function(in.file, trait.name="taxon", trait.value="Bacteria", regex1=NA, regex2="", ignore.case=TRUE) {
require(ShortRead)
otus.fasta <- readFasta(in.file)
# rename the sequences id if regex1 not NA
otus.fasta <- ComMA::renameFastaID(otus.fasta, regex1=regex1, regex2=regex2, ignore.case=ignore.case)
# 1st col is tip.label
taxa.map <- data.frame(tip.label=as.character(id(otus.fasta)), stringsAsFactors = F)
taxa.map[,trait.name] <- trait.value
return(taxa.map)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.