R/read_blast_databases.R

Defines functions read_blast_databases

Documented in read_blast_databases

#' @title Import BLAST output file
#' @description When performing BLAST searches with the \code{blast_*()} functions,
#' the corresponding BLAST output file can be imported into the current R session using this function.
#' All output formats listed under \code{out.format} are accepted, but only
#' \code{"postgres"} and \code{"csv"} are currently imported by this function;
#' for every other accepted format a warning is raised and \code{NULL} is returned.
#' @param file path to BLAST output file.
#' @param out.format a character string specifying the output format of the BLAST output that shall be imported.
#' Available options are:
#'  \itemize{
#'  \item \code{out.format = "postgres"} : store BLAST output as Postgres database and generate postgres database connection.
#'  \item \code{out.format = "xml"} : XML
#'  \item \code{out.format = "tsv"} : Tabular separated file
#'  \item \code{out.format = "csv"} : Comma-separated values
#'  \item \code{out.format = "json.seq.aln"} : Seqalign (JSON)
#'  \item \code{out.format = "json.blast.multi"} : Multiple-file BLAST JSON
#'  \item \code{out.format = "xml2.blast.multi"} : Multiple-file BLAST XML2
#'  \item \code{out.format = "json.blast.single"} : Single-file BLAST JSON
#'  \item \code{out.format = "xml2.blast.single"} : Single-file BLAST XML2
#'  }
#' @param postgres.user specify username for RPostgreSQL connection. Required when
#' \code{out.format = "postgres"}. The connection is opened against
#' \code{localhost:5432} with an empty password, and the database name is
#' taken to be the same as \code{postgres.user}.
#' @return
#'  \itemize{
#'  \item \code{out.format = "postgres"} : a \code{dplyr::tbl()} referencing the database table
#'  \code{<name>_postgres}, where \code{<name>} is the part of \code{basename(file)} before the first dot.
#'  An existing table of that name is overwritten.
#'  \item \code{out.format = "csv"} : the table returned by \code{readr::read_delim()} with
#'  column names set to \code{blast_outfmt_colnames_databases()}, or \code{FALSE} (with a message)
#'  when the file contains no BLAST hits.
#'  \item any other accepted format : \code{NULL} (invisibly), with a warning.
#'  }
#' @author Hajk-Georg Drost
#' @seealso \code{\link{blast_protein_to_protein}}, \code{\link{blast_nucleotide_to_protein}},
#' \code{\link{blast_nucleotide_to_nucleotide}}
#' @export

read_blast_databases <- function(file, out.format, postgres.user = NULL) {
  
  if (!file.exists(file))
    stop("The BLAST output file '", file, "' does not exist! Please check what might have went wrong with the BLAST call.", call. = FALSE)
  
  # formats accepted by the argument check (not all of them have an import branch below)
  accepted_formats <- c(
    "postgres",
    "xml",
    "tsv",
    "csv",
    "json.seq.aln",
    "json.blast.multi",
    "xml2.blast.multi",
    "json.blast.single",
    "xml2.blast.single"
  )
  
  if (!is.element(out.format, accepted_formats))
    stop("Sorry, but '",out.format,"' is not an available import type. Please choose an 'out.format' that is supported by this function.", call. = FALSE)
  
  if (out.format == "postgres") {
    
    if (is.null(postgres.user))
      stop("Please specify a 'postgres.user' to import BLAST output into PostgresSQL database.", call. = FALSE)
    
    # table name: file name up to the first dot, suffixed with "_postgres"
    postgres_filename <- paste0(unlist(stringr::str_split(basename(file),"[.]"))[1],"_postgres")
    
    connect_db <-
      DBI::dbConnect(
        DBI::dbDriver("PostgreSQL"),
        user = postgres.user,
        password = "",
        host = "localhost",
        port = 5432, 
        dbname = postgres.user)
    
    # register the cleanup right after connecting so that the connection is
    # also closed when writing the table or building the tbl fails
    on.exit(DBI::dbDisconnect(connect_db), add = TRUE)
    
    # the file path is handed to dbWriteTable() as 'value'; the file is
    # read as headerless, tab-separated text
    DBI::dbWriteTable(
      connect_db,
      name      = postgres_filename,
      value     = file,
      row.names = FALSE,
      header    = FALSE,
      sep       = "\t",
      overwrite = TRUE
    )
    
    # the returned tbl is backed by this second connection, so closing
    # 'connect_db' on exit does not invalidate the result
    blast_sql_db <-
      dplyr::src_postgres(
        dbname = postgres.user,
        host = "localhost",
        port = 5432,
        user = postgres.user,
        password = ""
      )
    
    blast_postgres <-
      dplyr::tbl(blast_sql_db, postgres_filename)
    
    return(blast_postgres)
  }     
  
  if (out.format == "csv") {
    # headerless file with 27 columns; column types are fixed per position
    blast_csv <- readr::read_delim(file = file, delim = ",", 
                                   col_names = FALSE,
                                   col_types = readr::cols(
                                     "X1" = readr::col_character(),
                                     "X2" = readr::col_character(),
                                     "X3" = readr::col_double(),
                                     "X4" = readr::col_integer(),
                                     "X5" = readr::col_integer(),
                                     "X6" = readr::col_integer(),
                                     "X7" = readr::col_integer(),
                                     "X8" = readr::col_integer(),
                                     "X9" = readr::col_integer(),
                                     "X10" = readr::col_double(),
                                     "X11" = readr::col_integer(),
                                     "X12" = readr::col_integer(),
                                     "X13" = readr::col_integer(),
                                     "X14" = readr::col_double(),
                                     "X15" = readr::col_double(),
                                     "X16" = readr::col_integer(),
                                     "X17" = readr::col_integer(),
                                     "X18" = readr::col_integer(),
                                     "X19" = readr::col_double(),
                                     "X20" = readr::col_number(),
                                     "X21" = readr::col_double(),
                                     "X22" = readr::col_character(),
                                     "X23" = readr::col_integer(),
                                     "X24" = readr::col_character(),
                                     "X25" = readr::col_character(),
                                     "X26" = readr::col_character(),
                                     "X27" = readr::col_character()
                                   ))
    if (nrow(blast_csv) > 0) {
      if (ncol(blast_csv) != length(blast_outfmt_colnames_databases()))
        stop("The number of blast output columns and the number of column names does not match! Please check what might have gone wrong.", call. = FALSE)
      colnames(blast_csv) <- blast_outfmt_colnames_databases()
      return(blast_csv)
    } else {
      message("Unfortunately, no BLAST hit was found for '",file, "'.")
      return(FALSE)
    }
    
  }    
  
  # every remaining accepted format has no import branch; the return value
  # stays NULL as before, but the caller is now told why
  warning("Importing BLAST output of type '", out.format, "' is not implemented yet. Returning NULL.", call. = FALSE)
  invisible(NULL)
}
HajkD/metablastr documentation built on Sept. 14, 2023, 5:26 p.m.