CMETNGS: Convenience functions for CMET NGS automated pipeline

Documented in fasta2dataframe

#' Conversion of (mothur) fasta files to a dataframe
#'
#' Reads a DNA fasta file and returns its sequences together with shortened
#' sequence identifiers extracted from the fasta headers.
#'
#' Headers that do not match the extraction pattern are kept unchanged.
#' Note that with \code{mothurrepseqs = FALSE} and the default
#' \code{striparg = "Otu"}, the mothur pattern (\code{Otu} followed by digits)
#' is still applied.
#'
#' @param fastaFile standard format DNA (not RNA/protein) fasta file
#' @param mothurrepseqs boolean indicating whether or not the file contains
#'   mothur-formatted headers for each sequence as generated by get.oturep.
#'   Defaults to TRUE.
#' @param striparg string to be kept as headers in more complex headers
#'   (allows for regex), ignored in case mothurrepseqs=TRUE (default)
#' @return A data.frame with one row per sequence and two columns:
#'   \code{SeqIDs} (the stripped headers) and \code{readseq} (the sequences
#'   as character strings).
#' @importFrom Biostrings readDNAStringSet
#' @examples
#' \dontrun{
#' # Representative sequences as written by mothur's get.oturep
#' otus <- fasta2dataframe("otus.rep.fasta")
#'
#' # Other header layouts: keep only the part matching a custom regex
#' asvs <- fasta2dataframe("asvs.fasta", mothurrepseqs = FALSE,
#'                         striparg = "ASV[0-9]+")
#' }
#' # TODO: add export option
#'
#' @export
fasta2dataframe <- function(fastaFile, mothurrepseqs = TRUE, striparg = "Otu") {
  stopifnot(is.character(striparg), length(striparg) == 1L, !is.na(striparg))

  ffread <- readDNAStringSet(fastaFile)

  # The custom pattern is only used for non-mothur headers with a
  # non-default striparg; every other combination uses the mothur Otu pattern.
  # as.logical() keeps accepting truthy values such as 1.
  use_custom <- !isTRUE(as.logical(mothurrepseqs)) && striparg != "Otu"
  id_pattern <- if (use_custom) {
    # Trailing ".*" (not ".+") so that an ID sitting at the very end of the
    # header is still extracted in full instead of being skipped or truncated.
    paste0(".*(", striparg, ").*")
  } else {
    ".*(Otu[0-9]+).*"
  }
  SeqIDs <- sub(id_pattern, "\\1", names(ffread))

  # paste() yields a plain, unnamed character vector, so the data frame keeps
  # default row names instead of inheriting the full fasta headers.
  readseq <- paste(ffread)

  data.frame(SeqIDs = SeqIDs, readseq = readseq)
}