#' Count similar barcodes
#'
#' This function counts the number of similar barcodes from a specific file generated by Mothur.
#'
#' @param file a character string giving the path of the Mothur file which the data are to be read from.
#'
#' @export
countBC <- function(file){
tab <- readLines(file)
tab <- lapply(tab, function(x) strsplit(gsub("\t", ",", x), split = ","))
res <- sapply(tab, function(x) length(x[[1]]))
names(res) <- sapply(tab, function(x) x[[1]][1])
res <- res - 1
return(res)
}
#' Partitions for RAxML
#'
#' This function takes a vector of nucleotide positions (delimiting partitions) and format it for RAxML.
#'
#' @param part a vector of integers giving sequentially the partitions limits.
#'
#' @export
raxmlPart <- function(part){
npart <- length(part) / 2
part <- matrix(part, ncol = 2, byrow = TRUE)
part <- apply(part, 1, function(x) paste0(x, collapse = "-"))
part <- paste0("DNA,\tpart", 1:npart, " = ", part, collapse = "\n")
return(part)
}
#' Clean Database IDs from sequence names
#'
#' Clean Database IDs from sequence names (suffix separated by underscore).
#'
#' @param x a vector of character strings to process.
#'
#' @return A vector of character strings without database IDs.
#'
#' @export
cleanID8 <- function(x){
res <- gsub("_([^_]*)$", "", x)
return(res)
}
extractID8 <- function(x){
res <- gsub("^.*_", "", x)
return(res)
}
#' Clean taxonomic names
#'
#' This function applies a serie of text-processing steps to clean, simplify and normalize taxonomic names.
#'
#' @param x a \code{data.frame}.
#' @param column the column (index or name) of \code{x} containing the taxonomic names.
#'
#' @return A \code{data.frame}.
#'
#' @export
cleanTaxa <- function(x, column = 1){
rgx <- grep(" sp\\.|undetermined", x[ , column]) # Regex to supress undetermined species
if(length(rgx > 0)) x <- x[-rgx, ]
x[ , column] <- gsub("[[:space:]]+", " ", x[ , column]) # Regex to supress replicated spaces, tabs and new lines
x[ , column] <- gsub(" cf. | aff. ", " ", x[ , column]) # Regex to supress "cf." and "aff."
rgx <- intersect(grep(" var\\. | var ", x[ , column], invert = TRUE), # Regex to supress taxa with more than 2 words except varieties
grep(" [^ ]+ [^ ]+", x[ , column]))
if(length(rgx > 0)) x <- x[-rgx, ]
x[ , column] <- gsub(" var ", " var\\. ", x[ , column]) # Regex to replace "var" with "var."
x[ , column] <- gsub(" $", "", x[ , column]) # Regex to supress possible space at the end of the taxa name
x[ , column] <- tolower(x[ , column]) # Force lower case for all characters
x[ , column] <- paste(toupper(substr(x[ , column], 1L, 1L)),
substr(x[ , column], 2L, 10000L), sep = "") # Force upper case for the 1st characters
return(x)
}
#Trim missing sites
trim.DNAbin <- function(x){
x.char <- as.character(x)
res <- x[, apply(x.char, 2, function(x) !any(x == "-"))]
return(res)
}
#Get variable and complete sites
varchar.DNAbin <- function(x){
x.char <- as.character(x)
sel1 <- apply(x.char, 2, function(x) !any(x == "-"))
sel2 <- apply(x.char, 2, function(x) length(unique(x)) != 1)
res <- x[, sel1 & sel2]
return(res)
}
#agreem retourne le nombre de sites non "gap" pour deux sequences alignees
agreem <- function(x, y, weights = NULL){ #2 DNAbin class sequences
x.ind <- which(x != 4) #Les indices correspondant à "-" (gaps)
y.ind <- which(y != 4)
if(is.null(weights)){
weights <- rep(1, max(ncol(x), ncol(y)))
}
res <- sum(weights[union(x.ind, y.ind)])
return(res)
}
selectBestSeqs <- function(seqs){
di <- as.character(seqs)
dix <- apply(di , 2, function(x) sum(x != "-"))
dix <- dix/max(dix)
mp <- apply(seqs, 1, function(x)
apply(seqs, 1, function(y)
agreem(x, y, weights = dix)))
quant <- seq(0.01, 0.99, by = 0.01)
quant.tab <- matrix(NA, nrow = length(quant), ncol = 4)
colnames(quant.tab) <- c("Sites", "Variable sites", "Sequences", "Species")
rownames(quant.tab) <- quant
quant.tab.idx <- 1
for(qt in quant){
quant.sel.seq <- seqs[rowSums(mp) > quantile(rowSums(mp), probs = qt), ]
rownames(quant.tab)[quant.tab.idx] <- qt
quant.tab[quant.tab.idx, "Sites"] <- ncol(trim.DNAbin(quant.sel.seq))
quant.tab[quant.tab.idx, "Variable sites"] <- ncol(varchar.DNAbin(quant.sel.seq))
quant.tab[quant.tab.idx, "Sequences"] <- nrow(quant.sel.seq)
quant.tab[quant.tab.idx, "Species"] <- length(unique(cleanID8(labels(quant.sel.seq))))
quant.tab.idx <- quant.tab.idx + 1
}
plot(quant.tab[, "Species"], quant.tab[, "Variable sites"],
xlab = "Number of species", ylab = "Number of variable sites")
cat("\n")
print(quant.tab)
cat("Enter quantile to select sequences: ")
user.quant <- readLines(n = 1)
res <- seqs[rowSums(mp) > quantile(rowSums(mp), probs = as.numeric(user.quant)), ]
cat(quant.tab[user.quant, "Sequences"], "sequences selected;", quant.tab[user.quant, "Species"], "species")
return(res)
}
#' Keep on sequence per species
#'
#' Keep one sequence per species by consensus
#'
#' @param x a \code{DNAbin} alignment
#' @param taxa the taxa list vector. Should be as long as the number of sequences.
#' @param excl.regex a regular expression to keep some taxa as is. Useful for undeterminated species.
#'
#' @return
#' A \code{DNAbin} matrix.
#' @export
#'
one_seq_one_spec <- function(x, taxa = labels.DNAbin(x), excl.regex = "_sp\\."){
if(is.list(x)) x <- as.matrix.DNAbin(x)
if(is.null(excl.regex)){
sel <- rep_len(TRUE, length(taxa))
} else {
sel <- !stringr::str_detect(taxa, excl.regex)
}
excl_res <- x[!sel, ]
rownames(excl_res) <- taxa[!sel]
x <- x[sel, ]
taxa <- taxa[sel]
res <- list()
for(i in unique(taxa)){
sel_seq <- x[taxa == i, ]
sel_con <- seqinr::consensus(as.character(sel_seq))
res[[i]] <- sel_con
}
res <- as.DNAbin(res)
res <- as.matrix(res)
res <- rbind.DNAbin(excl_res, res)
}
# Consensus sequences with exclusion of "-"
# x DNAbin matrix
# NEED TO BE TESTED
consensus_extra <- function(x){
con_mat <- seqinr::consensus(as.character(x), method = "profile")
con_mat["-" , colSums(con_mat) - con_mat["-", ] > 0] <- 0
apply(con_mat, 2, function(x) row.names(con_mat)[which.max(x)])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.