R/getAlignmentSignal.R

Defines functions getAlignmentSignal

Documented in getAlignmentSignal

#' Rboretum Alignment Signal Fetcher
#'
#' Given the path(s) to one or more alignments (and an optional list of taxa), this script returns split and base information for each site
#' @param alignment_path Where to find alignment files. Options include:
#' \itemize{
#'   \item A character vector of one or more alignment file paths  (relative or absolute)
#'   \item A path to a single directory containing all alignment files (relative or absolute)
#' }
#' @param species_info OPTIONAL: List of taxa to analyze [Default: Process alignment using all taxa if solo, all shared in multiple]. Can be provided as:
#' \itemize{
#'   \item phylo object from which species will be extracted; or
#'   \item multiPhylo object from which common species will be extracted; or
#'   \item Character vector with 3+ taxon IDs
#'   \item Semicolon-separated list of taxon IDs
#' }
#' @param use_gaps OPTIONAL: If FALSE, treat gaps as missing data (like N) [Default: TRUE, treat gaps as indel data]
#' @param alignment_name OPTIONAL: Chacter vector of names for each alignment. If missing or incomplete, the base filename is used
#' @param prefix OPTIONAL: If 'alignment_path' is a directory, provide a character vector of file prefixes (e.g. all alignment files start with "Mafft_")
#' @param suffix OPTIONAL: If 'alignment_path' is a directory, provide a character vector of file suffixes (e.g. all alignment files end with ".phy")
#' @param existing_signal OPTIONAL: Append these results to the output from getAlignmentSignal() run with the same species_info and a different alignment
#' @return Dataframe containing split and base information for each site in the alignment
#' @export

getAlignmentSignal <- function(alignment_path,species_info,use_gaps,alignment_name,prefix,suffix,existing_signal){
  
  # Ensure that a path and root taxa are provided as character vectors
  if(missing(alignment_path)){
    stop("No alignment file or directories indicated with 'alignment_path'")
  } else if(!is.character(alignment_path)){
    stop("'alignment_path' should be a character vector of file paths or the path to an alignment directory.")
  } 
  
  # Create regex search pattern in case a directory is given
  if(missing(prefix)){
    prefix <- c()
  } else if(is.null(prefix)){
    prefix <- c()    
  } else if(!is.character(prefix)){
    stop("'prefix' must be a character vector")
  } else{
    prefix <- unlist(purrr::map(.x=prefix,.f=function(x){paste(c("^",x),collapse = '')}))
    prefix <- paste(c("(",paste(prefix,collapse = "|"),")"),collapse = '')
  }
  
  if(missing(suffix)){
    suffix <- c()
  } else if(is.null(suffix)){
    suffix <- c()    
  } else if(!is.character(suffix)){
    stop("'suffix' must be a character vector")
  } else{
    suffix <- unlist(purrr::map(.x=suffix,.f=function(x){ifelse(substr(x,start = 1,stop = 1)==".",paste(c("\\",x,"$"),collapse = ''),paste(c(x,"$"),collapse = ''))}))
    suffix <- paste(c("(",paste(suffix,collapse = "|"),")"),collapse = '')
  }
  
  if(length(prefix)==0 & length(suffix)==0){
    align_regex <- ''
  } else if(length(prefix)>0 & length(suffix)==0){
    align_regex <- prefix
  } else if(length(prefix)==0 & length(suffix)>0){
    align_regex <- suffix
  } else if(length(prefix)>0 & length(suffix)>0){
    align_regex <- paste(paste(c(prefix,"(.*)",suffix),collapse = ""))
  }
  
  isFile <- file.exists(alignment_path) & !dir.exists(alignment_path)
  isDir <- dir.exists(alignment_path) & !isFile
  
  # Ensure files all exist
  if(length(alignment_path)==1){
    
    if(Rboretum::checkValidFiles(alignment_path)){ # 'alignment_path' points to a single valid file
      
      alignment_count <- 1
      alignment_path <- Rboretum::checkValidFiles(alignment_path,return_full_path = TRUE)
      default_name <- basename(alignment_path)
      
    } else if(isDir){ # 'alignment_path' points to a valid directory
      
      if(has_error(silent=TRUE,list.files(path=alignment_path,pattern=align_regex,full.names = TRUE,include.dirs = FALSE))){
        stop("Can't process file fetch. Check path or regex?")
      } else{
        
        alignment_path <- list.files(path=alignment_path,pattern=align_regex,full.names = TRUE,include.dirs = FALSE)
        
        if(length(alignment_path)==0){
          stop("Directory found, but no files identified in 'alignment_path'. Check regex?")
        } else if(length(alignment_path)==1){
          alignment_count <- 1
          default_name <-  basename(alignment_path)
        } else{
          alignment_count <- length(alignment_path)
          default_name <- purrr::map(alignment_path,.f = function(x){basename(x)}) %>% unlist() %>% as.character()
        }
      }
    } else{ stop("'alignment_path' points to neither a valid file or directory.") }
    
  } else{ # 'alignment_path' is a list of file paths
    
    file_check <- Rboretum::checkValidFiles(alignment_path) # Check that all paths in 'alignment_path' point to valid files
    
    if(!file_check){
      invalid_paths <- Rboretum::checkValidFiles(alignment_path,return_invalid = TRUE)
      print(invalid_paths)
      stop("The above paths from 'alignment_path' do not point to a valid file...")
    } else{
      alignment_path <- Rboretum::checkValidFiles(alignment_path,return_full_path = TRUE)
      alignment_count <- length(alignment_path)
      default_name <- purrr::map(alignment_path,.f = function(x){basename(x)}) %>% unlist() %>% as.character()
    }
  }
  
  # Set alignment names if requested, otherwise use default basenames
  if(missing(alignment_name)){
    alignment_name <- default_name
  } else if(length(alignment_name) != alignment_count){
    print(paste(c("'alignment_names' (",length(alignment_name),") and number of alignments (",alignment_count,") do not match...using default names..."),collapse = ''))
    alignment_name <- default_name
  }
  
  if(any(duplicated(alignment_name))){
    stop("'alignment_name' cannot contain duplicate values")
  }
  
  # Establish species of interest
  
  # If no species_info is provided, process alignments with taxa shared among all alignments
  if(missing(species_info)){
    if(alignment_count == 1){
      species_info = Rboretum::getAlignmentSpecies(alignment_path)
    } else{
      all_species <- purrr::map(.x=alignment_path,.f=function(x){Rboretum::semiVector(Rboretum::getAlignmentSpecies(x))})
      species_info <- Reduce(intersect, all_species) %>% Rboretum::vectorSemi() %>% rep(alignment_count)
    }
  } else if(Rboretum::isPhylo(species_info)){ # Get species from phylo
    species_info <- rep(Rboretum::semiSorter(species_info$tip.label),alignment_count)
  } else if(Rboretum::isMultiPhylo(species_info,check_three_taxa=TRUE)){ # Get shared species from multiPhylo
    species_info <- rep(Rboretum::semiSorter(Rboretum::getSharedTaxa(species_info)),alignment_count)
  } else if(is.character(species_info)){ # Get species from character vectors
    if(!Rboretum::semiChecker(species_info)){
      if(length(species_info)>=3){
        species_info <- rep(semiSorter(species_info),alignment_count)
      } else{ 
        stop("'species_info' contains fewer than 3 taxa...")
        }
    } else{
      if(length(species_info)==1){
        species_info <- semiSorter(species_info)
      } else if(length(species_info)>1){
        species_info <- purrr::map(.x=species_info,.f=function(x){semiVector(x)}) %>% unlist() %>% as.character() %>% unique() %>% naturalsort() %>% semiSorter()
      }
      if(length(semiVector(species_info))<3){
        stop("'species_info' contains fewer than 3 taxa...")
      } else{
        species_info <- rep(species_info,alignment_count)
      }
    }
  } else{
    stop("'species_info' should be a phylo, multiPhylo where trees share 3+ taxa, or a character vector with 3+ taxa")
  }
  
  # Get gap data
  if(missing(use_gaps)){
    gap_list <- rep("1",alignment_count) # Default, include gap data
  } else if(is.logical(use_gaps)){
    if(length(use_gaps)==1){
      if(use_gaps){
        gap_list <- rep("1",alignment_count)
      }else{
        gap_list <- rep("0",alignment_count)
      }
    } else{
      if(length(use_gaps)!=alignment_count){
        print(paste(c("'use_gaps' (",length(use_gaps),") and number of alignments (",alignment_count,") do not match...using default setting of including gaps..."),collapse = ''))
        gap_list <- rep("1",alignment_count)
      } else{
        gap_list <- purrr::map(.x=use_gaps,.f=function(x){ifelse(x,"1","0")}) %>% unlist()
      }
    } 
  } else{
    stop("'use_gaps' must be logical (single value or vector)")
  }
  
  if(alignment_count == 1){
    splits_df <- rb_run_align_splits(user_python,rb_lib_dir,alignment_path,species_info,gap_list,alignment_name) %>%
      mutate_if(is.list,as.character) %>%
      select(-index) %>%
      mutate_all(~na_if(as.character(.), 'NaN')) %>%
      rowwise() %>%
      mutate(All_Site_Bases = semiSorter(All_Site_Bases)) %>%
      ungroup()
  } else{
    splits_df = purrr::map(.x=1:alignment_count,.f=function(x){rb_run_align_splits(user_python,rb_lib_dir,alignment_path[x],species_info[x],gap_list[x],alignment_name[x])}) %>% do.call(rbind, .) %>%
      mutate_if(is.list,as.character) %>%
      select(-index) %>%
      mutate_all(~na_if(as.character(.), 'NaN')) %>%
      rowwise() %>%
      mutate(All_Site_Bases = semiSorter(All_Site_Bases)) %>%
      ungroup()
  }
  
  splits_df[is.na(splits_df)] <- NA
  
  if(nrow(splits_df)==0){
    stop("No data returned. Check alignments and taxa?")
  }
  
  if(missing(existing_signal)){ # If  not appending results, return results
    return(splits_df)
  } else{
    
    # Get current species_info
    if(alignment_count > 1){
      current_species <- semiSorter(species_info[[1]])
    } else{
      current_species <- semiSorter(species_info)
    }
    
    # Get taxa from existing_signal
    existing_taxa <- Rboretum::isAlignmentSignal(existing_signal,return_taxa = TRUE) %>% semiSorter()
    
    if(current_species != existing_taxa){ # Ensure existing_signal contains information about the same taxa
      print("Object passed as 'existing_signal' is failing the test isAlignmentSignal(existing_signal,species_info)...Returning results without appending...")
      return(splits_df)
    } else if(any(alignment_name %in% existing_signal$Alignment_Name)){ # Ensure that alignment names are all unique
      print("Object passed as 'existing_signal' contains data on an alignment named identically one processed here...Returning results without appending...")
      return(splits_df)
    } else{
      return(rbind(existing_signal,splits_df))
    }
  } 
}
BobLiterman/Rboretum documentation built on July 6, 2023, 7:46 p.m.