R/synonymRemover.r

#' A function that writes headers for the filtered FASTA database
#' @param i index of the sequence in the FASTA file that must be changed
#' @export
writeHeaders <- function(i){
  paste(database_filt[i,"superkingdom"], database_filt[i, "kingdom"],
        database_filt[i,"phylum"], database_filt[i,"class"], database_filt[i, "order"],
        database_filt[i,"family"], database_filt[i,"genus"], database_filt[i, "species"], sep=";")
}

#' A function to remove all non-ITS2 sequences and change any synonyms, both specified by CSVs
#' @param database_path Path of taxonomy database
#' @param fasta_path The path of the DADA2 formatted reference taxonomy database
#' @param to_remove_path Path of a CSV file containing names of sequences to remove
#' @param synonyms_path Path of a CSV specifying horse parasite synonym and official Lichtenfels 2008 name
#' @export
filter_and_remove_synonyms <- function(database_path, fasta_path, to_remove_path, synonyms_path) {

  date <- gsub("-", "_", Sys.Date())

  to_remove <- data.frame(lapply(read.csv(to_remove_path, header = F), as.character), stringsAsFactors = FALSE)
  colnames(to_remove) <- c("accn", "sequence")
  fasta <- readDNAStringSet(fasta_path)
  synonyms <- data.frame(lapply(read.csv(synonyms_path), as.character), stringsAsFactors=FALSE)

  filter_indices <- which(database$accn %in% to_remove$accn)
  database_filt <- as.data.frame(database[-filter_indices,])

  fasta_filt <- fasta[-filter_indices]

  database_filt$speciesOld <- database_filt$species       #Used to check if anything has changed
  database_filt$genusOld <- database_filt$genus           #Also to check if genera changed
  which(!database_filt$species == database_filt$speciesOld) #Should be 0, haven't changed anything yet
  which(!database_filt$genus == database_filt$genusOld) #Same as above

  #Change to official species names, and if species name is changed then also change genus name
  database_filt$species <- mapvalues(database_filt$species, from=synonyms$synonym_species_name, to=synonyms$species_name_used_by_Lichtenfels_2008, warn_missing = F)
  changed_species_indices <- which(!database_filt$species == database_filt$speciesOld)
  #Change genus name
  database_filt$genus <- mapvalues(database_filt$genus, from=synonyms$synonym_genus_name, to=synonyms$genus_name_used_by_Lichtenfels_2008, warn_missing = F)
  changed_genus_indices <- which(!database_filt$genusOld == database_filt$genus) #many genera changed?
  which(!database_filt$species == database_filt$speciesOld)
  which(!database_filt$genus == database_filt$genusOld)

  database_filt$speciesOld <- NULL ; database_filt$genusOld <- NULL
  database_filt$species <- sub(" ", "_", database_filt$species)

  #writexl::write_xlsx(database_filt, paste0("database_nr_v0_9_3_species_names_updated_", date,".xlsx"))

  synonym_indices <- c(changed_species_indices, changed_genus_indices)

  newHeaders <- unlist(lapply(synonym_indices, FUN=writeHeaders))
  fasta_filt@ranges@NAMES[synonym_indices] <- newHeaders
  fasta_filt@ranges@NAMES[synonym_indices]
  writeXStringSet(fasta_filt, paste0("dada2_nr_v0_9_3_species_names_updated_", date,".fasta"))

  }
sgavril/equinizeR documentation built on June 15, 2019, 2:44 p.m.