R/multiple_fasta_files.R

Defines functions multiple_fasta_files

Documented in multiple_fasta_files

#' A function to write multiple fasta files from a dataframe, grouping based on different variables.
#'
#' @param dataframe A DataFrame containing the \code{seq_col}, the \code{header_col} and \code{group_by_cols}.
#' @param group_by_cols A character vector with the names of the columns whose values are used to group by.
#' @param seq_col A string with the name of the column containing the fasta sequence.
#' @param header_col A string with the name of the column containing the fasta header.
#' @param toWrite A boolean, if \code{toWrite} is TRUE, fasta files are written.
#' @param path Path where files are written if \code{toWrite} is TRUE.
#'
#' @return A list of lists containing both the DataFrame grouped based on \code{group_by_cols} and the name of the file to write.
#' @export
#'
multiple_fasta_files <- function(dataframe, group_by_cols,
                                 seq_col = "fasta_seq",
                                 header_col = "fasta_header",
                                 toWrite=FALSE,
                                 path = "."){
  ## Check arguments
  if(!is.character(group_by_cols))
    stop("'group_by_cols' should be character")
  if(!is.character(seq_col))
    stop("'seq_col' should be character")
  if(!is.character(header_col))
    stop("'header_col' should be character")
  if(!is.character(path))
    stop("'path' should be character")
  if(length(seq_col) != 1)
    stop("'seq_col' should be length 1")
  if(length(header_col) != 1)
    stop("'header_col' should be length 1")
  if(length(path) != 1)
    stop("'path' should be length 1")
  if(!all(group_by_cols %in% colnames(dataframe)))
    stop("'group_by_cols' are not contained in dataframe")

  seq_col <- rlang::sym(seq_col)
  header_col <- rlang::sym(header_col)

  #Comprobamos si existe el directorio principal: Por ejemplo, fascinas 1
  if (toWrite & !dir.exists(file.path(path)))
  {dir.create(file.path(path))}
  # Iteramos a lo largo de las distintas columnas que serán subdirectorios
  df_list <- purrr::map(group_by_cols, function(x){
    #Comprobamos si existe el subdirectorio
    if (toWrite & !dir.exists(file.path(path, x)))
    {dir.create(file.path(path, x))}
    #Destacamos los valores únicos de esa columna
    dataframe %>%
      tidyr::drop_na(x) %>%
      dplyr::pull(x) %>% unique()%>%
      #Creamos una lista con el dataframe y su path
      purrr::map(function(y){
        var_column <- rlang::sym(x)
        dataframe %>%
          dplyr::filter(!!var_column == y) %>%
          dplyr::distinct(!!seq_col, .keep_all = TRUE)%>%
          dplyr::select(!!seq_col, !!header_col) %>%
          list(file.path(path, x, paste0(y, ".fasta"))) %>% return()
      })}) %>% purrr::flatten() #Eliminamos un nivel de jerarquía
  if (toWrite){
    # Escribimos los archivos
    df_list %>% purrr::walk(function(z){
      seqinr::write.fasta(open = "w",
                            sequences = as.list(z[[1]] %>% dplyr::pull(seq_col)),
                            names = z[[1]] %>% dplyr::pull(header_col),
                            nbchar = 80, as.string = TRUE,
                            file.out = z[[2]])# filepath
    })
  }
  return(df_list)
}
currocam/FascinRSCA documentation built on March 21, 2022, 6:29 a.m.