metablastr: Perform Massive Local BLAST Searches

Documented in extract_hit_seqs_from_genomes

#' @title Extract sequences of BLAST hits in respective genomes and store it as \code{fasta} file(s)
#' @description In some cases, users may wish to extract the sequences of the respective blast hit
#' stored in a \code{blast_tbl}. This function enables to quickly extract such sequences and store
#' them in one general or multiple genome specific \code{fasta} file(s).
#' @param blast_tbl a BLAST table generated with \code{\link{blast_genomes}}.
#' @param subject_genomes a vector containing file paths to the reference genomes that shall be queried (e.g. file paths returned by \code{\link[biomartr]{meta.retrieval}}).
#' @param file_name name of the fasta file that stores the BLAST hit sequences. This name will only be used when \code{separated_by_genome = FALSE}.
#' @param separated_by_genome a logical value indicating whether or not hit sequences from different genomes should be stored in the same
#' output \code{fasta} file \code{separated_by_genome = FALSE} (default) or in separate \code{fasta} files \code{separated_by_genome = TRUE}.
#' @param update shall an existing \code{file_name} file be overwritten (\code{update = TRUE}; Default) or shall blast hit sequences be appended to the existing file (\code{update = FALSE})? 
#' @param path a folder path in which corresponding \code{fasta} output files shall be stored.
#' @author Hajk-Georg Drost
#' @export
extract_hit_seqs_from_genomes <-
  function(blast_tbl,
           subject_genomes,
           file_name = NULL,
           separated_by_genome = FALSE,
           update = TRUE,
           path = NULL) {
    
    if (any(!file.exists(subject_genomes)))
      stop(
        "At least one of the genome paths seems not to exist. Please check that all paths correspond to the correct location of the genome files.",
        call. = FALSE
      )
    
    if (!is.null(file_name) & separated_by_genome)
      stop(
        "A file name can only be specified when separated_by_genome = 'FALSE'. Otherwise separate species specific fasta files are being generated.",
        call. = FALSE
      )
    
    message("Starting sequence extraction process ...")
    
    ifelse(
      separated_by_genome,
      seq_file_paths <-
        vector("character", 1),
      seq_file_paths <- vector("character", length(subject_genomes))
    )
    
    species <- s_strand <- subject_id <- NULL
    
    available_species <- names(table(blast_tbl$species))
    
    if (separated_by_genome) {
      for (i in seq_len(length(subject_genomes))) {
        # remove appendix *.fa from file name
        species_refined_name <-
          unlist(stringr::str_split(basename(subject_genomes[i]), "[.]"))[1]
        
        message("Processing organism ", species_refined_name, " ...")
        if (!is.element(species_refined_name, available_species)) {
          message("Organism ",
                  species_refined_name,
                  " didn't have any BLAST hits.")
        } else {
          imported_genome_i <- biomartr::read_genome(subject_genomes[i])
          
          species_specific_blast_tbl <-
            dplyr::filter(blast_tbl, species == species_refined_name)
          
          if (nrow(species_specific_blast_tbl) > 0) {
            strand <-
              ifelse(
                species_specific_blast_tbl$s_end - species_specific_blast_tbl$s_start + 1 > 0,
                "plus",
                "minus"
              )
            
            species_specific_blast_tbl <-
              dplyr::mutate(species_specific_blast_tbl, s_strand = strand)
            
            species_specific_blast_tbl$subject_id <- unlist(lapply(stringr::str_trim(species_specific_blast_tbl$subject_id, side = "both"), function(x) {
              new_name <- unlist(stringr::str_split(x, " "))[1]
              new_name <- unlist(stringr::str_replace_all(new_name,"[.]","_"))
              return(new_name)
            }))
            
            imported_genome_i_names <-
              unlist(lapply(stringr::str_trim(names(imported_genome_i), side = "both"), function(x) {
                new_name <- unlist(stringr::str_split(x, " "))[1]
                new_name <- unlist(stringr::str_replace_all(new_name,"[.]","_"))
                return(new_name)
              }))
            
            names(imported_genome_i) <- imported_genome_i_names
            
            # only retain chromosome names that are present in both: genome and BLAST table
            chr_names <-
              dplyr::intersect(imported_genome_i_names, names(table(
                species_specific_blast_tbl$subject_id
              )))
            chr_names <- as.character(chr_names)
            
            if (length(chr_names) == 0)
              stop(
                "It seems that the chromosome names used in the input 'blast_tbl' and in the input genome fasta file do not match. Please make sure that the chromosome names match in both cases.",
                call. = FALSE
              )
            
            # construct output file paths
            seq_file_paths[i] <- ifelse(is.null(path),
                                        file.path(
                                          getwd(),
                                          paste0(subject_genomes[i], "_blast_tbl_sequences.fa")
                                        ),
                                        file.path(
                                          path,
                                          paste0(subject_genomes[i], "_blast_tbl_sequences.fa")
                                        ))
            
            if (update & file.exists(seq_file_paths[i]))
              fs::file_delete(seq_file_paths[i])
            
            # for each chromosome separately
            for (j in seq_len(length(chr_names))) {
              message("  -> Extracting BLAST hit sequences from ",
                      chr_names[j],
                      " ...")
              # divide by plus and minus strand
              
              species_specific_blast_tbl_plus_strand <-
                dplyr::filter(species_specific_blast_tbl,
                              subject_id == chr_names[j],
                              s_strand == "plus")
              species_specific_blast_tbl_minus_strand <-
                dplyr::filter(
                  species_specific_blast_tbl,
                  subject_id == chr_names[j],
                  s_strand == "minus"
                )
              
              if (nrow(species_specific_blast_tbl_plus_strand) > 0 & nrow(species_specific_blast_tbl_minus_strand) > 0) {
                if (!identical(
                  unique(species_specific_blast_tbl_plus_strand$s_len),
                  imported_genome_i[chr_names[j]]@ranges@width
                ))
                  stop(
                    "The chromosome length of ",
                    chr_names[j],
                    " (",
                    imported_genome_i[chr_names[j]]@ranges@width,
                    ") does not match the chromosome length s_len in the blast_tbl (",
                    unique(species_specific_blast_tbl_plus_strand$s_len),
                    "). Thus, BLAST hit sequences cannot be extracted due to indexing shifts...",
                    call. = FALSE
                    
                  )
              }
              
              if (nrow(species_specific_blast_tbl_plus_strand) > 0) {
                seqs_plus <- Biostrings::extractAt(
                  imported_genome_i[chr_names[j]],
                  at = IRanges::IRanges(
                    start = species_specific_blast_tbl_plus_strand$s_start,
                    end = species_specific_blast_tbl_plus_strand$s_end,
                    names = paste0(
                      species_specific_blast_tbl_plus_strand$species,
                      "_",
                      species_specific_blast_tbl_plus_strand$subject_id,
                      "_",
                      species_specific_blast_tbl_plus_strand$s_start,
                      "_",
                      species_specific_blast_tbl_plus_strand$s_end,
                      "_strand_plus"
                    )
                  )
                )
                
                # store sequences in fasta files
                Biostrings::writeXStringSet(
                  seqs_plus@unlistData,
                  filepath = ifelse(
                    is.null(path),
                    file.path(
                      getwd(),
                      paste0(subject_genomes[i], "_blast_tbl_sequences.fa")
                    ),
                    file.path(
                      path,
                      paste0(subject_genomes[i], "_blast_tbl_sequences.fa")
                    )
                  ),
                  format = "fasta",
                  append = TRUE
                )
              }
              
              if (nrow(species_specific_blast_tbl_minus_strand) > 0) {
                seqs_minus <- Biostrings::extractAt(
                  Biostrings::reverse(imported_genome_i[chr_names[j]]),
                  at = IRanges::IRanges(
                    start = imported_genome_i[chr_names[j]]@ranges@width - species_specific_blast_tbl_minus_strand$s_start + 1,
                    end = imported_genome_i[chr_names[j]]@ranges@width - species_specific_blast_tbl_minus_strand$s_end + 1,
                    names = paste0(
                      species_specific_blast_tbl_minus_strand$species,
                      "_",
                      species_specific_blast_tbl_minus_strand$subject_id,
                      "_",
                      species_specific_blast_tbl_minus_strand$s_start,
                      "_",
                      species_specific_blast_tbl_minus_strand$s_end,
                      "_strand_minus"
                    )
                  )
                )
          
                Biostrings::writeXStringSet(
                  seqs_minus@unlistData,
                  filepath = ifelse(
                    is.null(path),
                    file.path(
                      getwd(),
                      paste0(subject_genomes[i], "_blast_tbl_sequences.fa")
                    ),
                    file.path(
                      path,
                      paste0(subject_genomes[i], "_blast_tbl_sequences.fa")
                    )
                  ),
                  format = "fasta",
                  append = TRUE
                )
              }
            }
          }
        }
      } 
    } else {
      if (is.null(file_name)) {
        warning(
          "No file name was specified, thus the default file name 'blast_hit_seqs.fa' was used.",
          call. = FALSE
        )
        file_name <- "blast_hit_seqs.fa"
      }
      
      seq_file_paths <- ifelse(is.null(path),
                               file.path(getwd(), file_name),
                               file.path(path, file_name))
      
      if (update & file.exists(seq_file_paths))
        fs::file_delete(seq_file_paths)
      
      for (i in seq_len(length(subject_genomes))) {
        message("Processing organism ",
                basename(subject_genomes[i]),
                " ...")
        # remove appendix *.fa from file name
        species_refined_name <-
          unlist(stringr::str_split(basename(subject_genomes[i]), "[.]"))[1]
        
        if (!is.element(species_refined_name, available_species)) {
          message("Organism ",
                  species_refined_name,
                  " didn't have any BLAST hits.")
        } else {
          imported_genome_i <- biomartr::read_genome(subject_genomes[i])
          
          
          species_specific_blast_tbl <-
            dplyr::filter(blast_tbl, species == species_refined_name)
          
          if (nrow(species_specific_blast_tbl) > 0) {
            strand <-
              ifelse(
                species_specific_blast_tbl$s_end - species_specific_blast_tbl$s_start + 1 > 0,
                "plus",
                "minus"
              )
            species_specific_blast_tbl <-
              dplyr::mutate(species_specific_blast_tbl, s_strand = strand)
            
            species_specific_blast_tbl$subject_id <- unlist(lapply(stringr::str_trim(species_specific_blast_tbl$subject_id, side = "both"), function(x) {
              new_name <- unlist(stringr::str_split(x, " "))[1]
              new_name <- unlist(stringr::str_replace_all(new_name,"[.]","_"))
              return(new_name)
            }))
        
            imported_genome_i_names <-
              unlist(lapply(stringr::str_trim(names(imported_genome_i), side = "both"), function(x) {
                new_name <- unlist(stringr::str_split(x, " "))[1]
                new_name <- unlist(stringr::str_replace_all(new_name,"[.]","_"))
                return(new_name)
              }))
            
            names(imported_genome_i) <- imported_genome_i_names
            
            # only retain chromosome names that are present in both: genome and BLAST table
            chr_names <-
              dplyr::intersect(imported_genome_i_names, names(table(
                species_specific_blast_tbl$subject_id
              )))
            chr_names <- as.character(chr_names)
            
            if (length(chr_names) == 0)
              stop(
                "It seems that the chromosome names used in the input 'blast_tbl' and in the input genome fasta file do not match. Please make sure that the chromosome names match in both cases.",
                call. = FALSE
              )
            
            # for each chromosome separately
            for (j in seq_len(length(chr_names))) {
              message("  -> Extracting BLAST hit sequences from ",
                      chr_names[j],
                      " ...")
              
              # divide by plus and minus strand
              species_specific_blast_tbl_plus_strand <-
                dplyr::filter(species_specific_blast_tbl,
                              subject_id == chr_names[j],
                              s_strand == "plus")
              species_specific_blast_tbl_minus_strand <-
                dplyr::filter(
                  species_specific_blast_tbl,
                  subject_id == chr_names[j],
                  s_strand == "minus"
                )
              
              if (nrow(species_specific_blast_tbl_plus_strand) > 0 & nrow(species_specific_blast_tbl_minus_strand) > 0) {
                if (!identical(
                  as.integer(unique(species_specific_blast_tbl_plus_strand$s_len)),
                  as.integer(imported_genome_i[chr_names[j]]@ranges@width)
                ))
                  stop(
                    "The chromosome length of ",
                    chr_names[j],
                    " (",
                    imported_genome_i[chr_names[j]]@ranges@width,
                    ") does not match the chromosome length s_len in the blast_tbl (",
                    unique(species_specific_blast_tbl_plus_strand$s_len),
                    "). Thus, BLAST hit sequences cannot be extracted due to indexing shifts...",
                    call. = FALSE
                    
                  )
              }
              
              if (nrow(species_specific_blast_tbl_plus_strand) > 0) {
                seqs_plus <- Biostrings::extractAt(
                  imported_genome_i[chr_names[j]],
                  at = IRanges::IRanges(
                    start = species_specific_blast_tbl_plus_strand$s_start,
                    end = species_specific_blast_tbl_plus_strand$s_end,
                    names = paste0(
                      species_specific_blast_tbl_plus_strand$species,
                      "_",
                      species_specific_blast_tbl_plus_strand$subject_id,
                      "_",
                      species_specific_blast_tbl_plus_strand$s_start,
                      "_",
                      species_specific_blast_tbl_plus_strand$s_end,
                      "_strand_plus"
                    )
                  )
                )
                Biostrings::writeXStringSet(
                  seqs_plus@unlistData,
                  filepath = ifelse(
                    is.null(path),
                    file.path(getwd(), file_name),
                    file.path(path, file_name)
                  ),
                  format = "fasta",
                  append = TRUE
                )
              }
              
              if (nrow(species_specific_blast_tbl_minus_strand) > 0){
                seqs_minus <- Biostrings::extractAt(
                  Biostrings::reverse(imported_genome_i[chr_names[j]]),
                  at = IRanges::IRanges(
                    start = imported_genome_i[chr_names[j]]@ranges@width - species_specific_blast_tbl_minus_strand$s_start + 1,
                    end = imported_genome_i[chr_names[j]]@ranges@width - species_specific_blast_tbl_minus_strand$s_end + 1,
                    names = paste0(
                      species_specific_blast_tbl_minus_strand$species,
                      "_",
                      species_specific_blast_tbl_minus_strand$subject_id,
                      "_",
                      species_specific_blast_tbl_minus_strand$s_start,
                      "_",
                      species_specific_blast_tbl_minus_strand$s_end,
                      "_strand_minus"
                    )
                  )
                )
                Biostrings::writeXStringSet(
                  seqs_minus@unlistData,
                  filepath = ifelse(
                    is.null(path),
                    file.path(getwd(), file_name),
                    file.path(path, file_name)
                  ),
                  format = "fasta",
                  append = TRUE
                )
              } 
            }
          }
        }
      }
    }
    message("Sequence extraction process for all organisms finished without any problems.")
    return(seq_file_paths)
  }