diatobc: Bioinformatic toolbox for my projects

#' Count similar barcodes 
#' 
#' This function counts the number of similar barcodes from a specific file generated by Mothur.
#' 
#' @param file a character string giving the path of the Mothur file which the data are to be read from.
#' 
#' @export
countBC <- function(file){
  tab <- readLines(file)
  tab <- lapply(tab, function(x) strsplit(gsub("\t", ",", x), split = ","))
  res <- sapply(tab, function(x) length(x[[1]]))
  names(res) <- sapply(tab, function(x) x[[1]][1])
  res <- res - 1
  return(res)
}


#' Partitions for RAxML
#' 
#' This function takes a vector of nucleotide positions (delimiting partitions) and format it for RAxML.
#' 
#' @param part a vector of integers giving sequentially the partitions limits.
#' 
#' @export
raxmlPart <- function(part){
  npart <- length(part) / 2
  part <- matrix(part, ncol = 2, byrow = TRUE)
  part <- apply(part, 1, function(x) paste0(x, collapse = "-"))
  part <- paste0("DNA,\tpart", 1:npart, " = ", part, collapse = "\n")
  return(part)
}


#' Clean Database IDs from sequence names
#' 
#' Clean Database IDs from sequence names (suffix separated by underscore).
#' 
#' @param x a vector of character strings to process.
#' 
#' @return A vector of character strings without database IDs.
#' 
#' @export
cleanID8 <- function(x){
  res <- gsub("_([^_]*)$", "", x)
  return(res)
}

extractID8 <- function(x){
  res <- gsub("^.*_", "", x)
  return(res)
}


#' Clean taxonomic names
#' 
#' This function applies a serie of text-processing steps to clean, simplify and normalize taxonomic names.
#' 
#' @param x a \code{data.frame}.
#' @param column the column (index or name) of \code{x} containing the taxonomic names.
#' 
#' @return A \code{data.frame}.
#' 
#' @export
cleanTaxa <- function(x, column = 1){
  rgx <- grep(" sp\\.|undetermined", x[ , column])                              # Regex to supress undetermined species
  if(length(rgx > 0)) x <- x[-rgx, ]
  x[ , column] <- gsub("[[:space:]]+", " ", x[ , column])                       # Regex to supress replicated spaces, tabs and new lines
  x[ , column] <- gsub(" cf. | aff. ", " ", x[ , column])                       # Regex to supress "cf." and "aff."
  rgx <- intersect(grep(" var\\. | var ", x[ , column], invert = TRUE),         # Regex to supress taxa with more than 2 words except varieties
                   grep(" [^ ]+ [^ ]+", x[ , column]))
  if(length(rgx > 0)) x <- x[-rgx, ]
  x[ , column] <- gsub(" var ", " var\\. ", x[ , column])                       # Regex to replace "var" with "var."
  x[ , column] <- gsub(" $", "", x[ , column])                                  # Regex to supress possible space at the end of the taxa name
  x[ , column] <- tolower(x[ , column])                                         # Force lower case for all characters
  x[ , column] <- paste(toupper(substr(x[ , column], 1L, 1L)),
                        substr(x[ , column], 2L, 10000L), sep = "")            # Force upper case for the 1st characters
  return(x)
}

#Trim missing sites
trim.DNAbin <- function(x){
  x.char <- as.character(x)
  res <- x[, apply(x.char, 2, function(x) !any(x == "-"))]
  return(res)
}

#Get variable and complete sites
varchar.DNAbin <- function(x){
  x.char <- as.character(x)
  sel1 <- apply(x.char, 2, function(x) !any(x == "-"))
  sel2 <- apply(x.char, 2, function(x) length(unique(x)) != 1)
  res <- x[, sel1 & sel2]
  return(res)
}

#agreem retourne le nombre de sites non "gap" pour deux sequences alignees
agreem <- function(x, y, weights = NULL){   #2 DNAbin class sequences
  x.ind <- which(x != 4)    #Les indices correspondant à "-" (gaps)
  y.ind <- which(y != 4)
  if(is.null(weights)){
    weights <- rep(1, max(ncol(x), ncol(y)))
  }
  res <- sum(weights[union(x.ind, y.ind)])
  return(res)
}


selectBestSeqs <- function(seqs){
  di <- as.character(seqs)
  dix <- apply(di , 2, function(x) sum(x != "-"))
  dix <- dix/max(dix)
  mp <- apply(seqs, 1, function(x)
    apply(seqs, 1, function(y)
      agreem(x, y, weights = dix)))
  quant <- seq(0.01, 0.99, by = 0.01)
  quant.tab <- matrix(NA, nrow = length(quant), ncol = 4)
  colnames(quant.tab) <- c("Sites", "Variable sites", "Sequences", "Species")
  rownames(quant.tab) <- quant
  quant.tab.idx <- 1
  for(qt in quant){
    quant.sel.seq <- seqs[rowSums(mp) > quantile(rowSums(mp), probs = qt), ]
    rownames(quant.tab)[quant.tab.idx] <- qt
    quant.tab[quant.tab.idx, "Sites"] <- ncol(trim.DNAbin(quant.sel.seq))
    quant.tab[quant.tab.idx, "Variable sites"] <- ncol(varchar.DNAbin(quant.sel.seq))
    quant.tab[quant.tab.idx, "Sequences"] <- nrow(quant.sel.seq)
    quant.tab[quant.tab.idx, "Species"] <- length(unique(cleanID8(labels(quant.sel.seq))))
    quant.tab.idx <- quant.tab.idx + 1
  }
  plot(quant.tab[, "Species"], quant.tab[, "Variable sites"],
       xlab = "Number of species", ylab = "Number of variable sites")
  cat("\n")
  print(quant.tab)
  cat("Enter quantile to select sequences: ")
  user.quant <- readLines(n = 1)
  res <- seqs[rowSums(mp) > quantile(rowSums(mp), probs = as.numeric(user.quant)), ]
  cat(quant.tab[user.quant, "Sequences"], "sequences selected;", quant.tab[user.quant, "Species"], "species")
  return(res)
}




#' Keep on sequence per species
#' 
#' Keep one sequence per species by consensus
#'
#' @param x a \code{DNAbin} alignment  
#' @param taxa the taxa list vector. Should be as long as the number of sequences.
#' @param excl.regex a regular expression to keep some taxa as is. Useful for undeterminated species.
#'
#' @return
#' A \code{DNAbin} matrix.
#' @export
#'
one_seq_one_spec <- function(x, taxa = labels.DNAbin(x), excl.regex = "_sp\\."){
  if(is.list(x)) x <- as.matrix.DNAbin(x)
  if(is.null(excl.regex)){
    sel <- rep_len(TRUE, length(taxa))
  } else {
    sel <- !stringr::str_detect(taxa, excl.regex)
  }
  
  excl_res <- x[!sel, ]
  rownames(excl_res) <- taxa[!sel]
  
  x <- x[sel, ]
  taxa <- taxa[sel]
  res <- list()
  for(i in unique(taxa)){
    sel_seq <- x[taxa == i, ]
    sel_con <- seqinr::consensus(as.character(sel_seq))
    res[[i]] <- sel_con
  }
  res <- as.DNAbin(res)
  res <- as.matrix(res)
  
  res <- rbind.DNAbin(excl_res, res)
}



# Consensus sequences with exclusion of "-"
# x DNAbin matrix
# NEED TO BE TESTED
consensus_extra <- function(x){ 
  con_mat <- seqinr::consensus(as.character(x), method = "profile")
  con_mat["-" , colSums(con_mat) - con_mat["-", ] > 0] <- 0
  apply(con_mat, 2, function(x) row.names(con_mat)[which.max(x)])
}