
## annotation_microbesonline.R: Extract bacterial annotations from
## microbesonline.  I like microbesonline quite a lot!  Their MySQL interface is
## a little clunky, so I rewrote these functions to just webscrape it.

#' Download the various file formats from microbesonline.
#'
#' Microbesonline provides an interesting set of file formats to download.  Each
#' format proves useful under one condition or another, ergo this defaults to
#' iterating through them all and getting every file.
#'
#' @param id Microbesonline taxon ID to query.
#' @param type File type(s) to download; if left NULL, grab the genbank, tab,
#'  protein fasta, transcript fasta, and genome files.
#' @return List describing the files downloaded and their locations.
download_microbesonline_files <- function(id = "160490", type = NULL) {
  retlist <- list()
  if (is.null(type)) {
    type <- c("gbk", "tab", "prot", "tx", "genome")
  }
  if (length(type) > 1) {
    for (t in type) {
      retlist <- c(retlist, download_microbesonline_files(id = id, type = t))
    }
    return(retlist)
  }
  gbk <- type == "gbk"
  tab <- type == "tab"
  prot <- type == "prot"
  tx <- type == "tx"
  genome <- type == "genome"

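  ## Scrape the genome info page so its title can provide a human-readable
  ## species name for the download messages below.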
  prelude_url <- glue("http://microbesonline.org/cgi-bin/genomeInfo.cgi?tId={id}")
  result <- xml2::read_html(prelude_url)
  titles <- rvest::html_nodes(result, "title")
  species <- (titles %>% rvest::html_text())[1]

  if (isTRUE(gbk)) {
    gbk_url <- glue("http://microbesonline.org/cgi-bin/genomeInfo.cgi?tId={id};export=gbk")
    gbk_file <- glue("{id}.gbk")
    message("The species being downloaded is: ", species,
            " and is being downloaded as ", gbk_file, ".")
    gbk_downloaded <- download.file(gbk_url, gbk_file, quiet = TRUE)
    retlist[["gbk"]] <- gbk_file
  }

  if (isTRUE(tab)) {
    tab_url <- glue("http://microbesonline.org/cgi-bin/genomeInfo.cgi?tId={id};export=tab")
    tab_file <- glue("{id}.tab")
    message("The species being downloaded is: ", species,
            " and is being downloaded as ", tab_file, ".")
    tab_downloaded <- download.file(tab_url, tab_file, quiet = TRUE)
    retlist[["tab"]] <- tab_file
  }

  if (isTRUE(prot)) {
    prot_url <- glue(
        "http://microbesonline.org/cgi-bin/genomeInfo.cgi?tId={id};export=proteomes")
    prot_file <- glue("{id}_proteome.fasta")
    message("The species being downloaded is: ", species,
            " and is being downloaded as ", prot_file, ".")
    prot_downloaded <- download.file(prot_url, prot_file, quiet = TRUE)
    retlist[["prot"]] <- prot_file
  }

  if (isTRUE(tx)) {
    tx_url <- glue(
        "http://microbesonline.org/cgi-bin/genomeInfo.cgi?tId={id};export=transcriptomes")
    tx_file <- glue("{id}_tx.fasta")
    message("The species being downloaded is: ", species,
            " and is being downloaded as ", tx_file, ".")
    tx_downloaded <- download.file(tx_url, tx_file, quiet = TRUE)
    retlist[["tx"]] <- tx_file
  }

  if (isTRUE(genome)) {
    genome_url <- glue(
        "http://microbesonline.org/cgi-bin/genomeInfo.cgi?tId={id};export=genomes")
    genome_file <- glue("{id}_genome.fasta")
    message("The species being downloaded is: ", species,
            " and is being downloaded as ", genome_file, ".")
    genome_downloaded <- download.file(genome_url, genome_file, quiet = TRUE)
    retlist[["genome"]] <- genome_file
  }
  return(retlist)
}
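
## A minimal usage sketch; it assumes network access and that the default
## taxon ID 160490 still exists on microbesonline.  Wrapped in if (FALSE) so
## nothing is downloaded when this file is sourced.
if (FALSE) {
  mo_files <- download_microbesonline_files(id = "160490", type = c("gbk", "tab"))
  mo_files[["tab"]]  ## Path of the downloaded tab-delimited annotation file.
}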

#' Extract microbesonline taxon IDs without having to click on the weird boxes
#' at the top of the website.
#'
#' This should simplify getting material from microbesonline.
#'
#' @param species String to search the set of microbesonline taxa.
#' @return NULL, or one or more taxon IDs.
#' @seealso [xml2]
#' @examples
#'  coli_taxids <- get_microbesonline_taxid(species = "coli S88")
#'  head(coli_taxids)
#' @export
get_microbesonline_taxid <- function(species = "Acyrthosiphon pisum virus") {
  id_url <- "http://microbesonline.org/cgi-bin/fetchGenome2.cgi?taxId=g1&byFavorites=1"
  result <- xml2::read_html(id_url)
  id_nodes <- result %>%
    rvest::html_nodes("#GenomeList")
  . <- id_nodes  ## Shush, R CMD check
  id_links <- id_nodes %>%
    rvest::html_nodes("td:nth-child(1) a") %>%
    rvest::html_attr("href") %>%
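    ## Each link's href ends in the taxon ID, e.g. ".../genomeInfo.cgi?tId=160490"
    ## (illustrative); the gsub keeps only those trailing digits.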
    gsub(pattern = "^.*?tId=([[:digit:]]+)$", replacement = "\\1", x=.)
  id_table <- id_nodes %>%
    rvest::html_table(header = TRUE, fill = TRUE)
  id_df <- id_table[[1]]  ## Grab the first (only) element
  id_df[["tax_id"]] <- id_links
  foundp <- grep(pattern = species, x = id_df[["Genome"]])
  ret <- NULL
  if (length(foundp) == 0) {
    message("Did not find any entries with species: ", species)
  } else if (length(foundp) == 1) {
    result <- id_df[foundp, ]
    message("Found 1 entry.")
    message(result)
    ret <- result[["tax_id"]]
  } else {
    result <- id_df[foundp, ]
    message("Found multiple entries.")
    message(result)
    ret <- result[["tax_id"]]
    names(ret) <- result[["Genome"]]
  }
  return(ret)
}
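
## When more than one genome matches, the return value is a named vector of
## taxon IDs.  A sketch of picking one (the search string is only an example,
## and the block is guarded so it never runs on source):
if (FALSE) {
  coli_ids <- get_microbesonline_taxid(species = "coli")
  names(coli_ids)  ## The matching Genome names, one per taxon ID.
  first_id <- coli_ids[[1]]
}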

#' Skip the db and download all the text annotations for a given species.
#'
#' The publicly available microbesonline MySQL database is rather more complex
#' than I prefer.  This skips that process and just grabs a tsv copy of
#' everything and loads it into a dataframe.  If only a species name is
#' provided, get_microbesonline_taxid() is used to look up the taxon ID.
#'
#' Tested in test_70expt_spyogenes.R
#' There is so much awesome information in microbesonline, but damn is it
#' annoying to download. This function makes that rather easier, or so I hope at
#' least.
#'
#' @param species Microbesonline species.
#' @param id Microbesonline ID to query.
#' @return Dataframe containing the annotation information.
#' @seealso [rvest] [xml2] [readr]
#' @examples
#'  pa14_microbesonline_annot <- load_microbesonline_annotations(species = "PA14")
#'  colnames(pa14_microbesonline_annot)
#' @export
load_microbesonline_annotations <- function(species = NULL, id = NULL) {
  if (is.null(id) & is.null(species)) {
    stop("This needs either a species or taxon id.")
  } else if (is.null(id)) {
    id <- get_microbesonline_taxid(species)
    if (is.null(id)) {
      stop("The species name did not find a taxon id.")
    } else if (length(id) > 1) {
      warning("There are more than 1 taxon id matches, arbitrarily choosing the first.")
      id <- id[1]
    }
  }
  prelude_url <- paste0("http://microbesonline.org/cgi-bin/genomeInfo.cgi?tId=", id)
  result <- xml2::read_html(prelude_url)
  titles <- rvest::html_nodes(result, "title")
  species <- (titles %>% rvest::html_text())[1]
  message("The species being downloaded is: ", species)
  url <- glue::glue("http://www.microbesonline.org/cgi-bin/genomeInfo.cgi?tId={id};export=tab")
  message("Downloading: ", url)
  data <- sm(readr::read_tsv(url))
  return(data)
}
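
## Roughly equivalent to calling load_microbesonline_annotations(species = ...):
## first resolve a taxon ID, then fetch its tab-delimited annotations.  The
## "pyogenes" search string is only an example, and the block is guarded so it
## never runs on source.
if (FALSE) {
  spy_id <- get_microbesonline_taxid(species = "pyogenes")
  spy_annot <- load_microbesonline_annotations(id = spy_id[[1]])
  dim(spy_annot)
}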

#' Extract the set of GO categories by microbesonline locus
#'
#' Microbesonline is such a fantastic resource; it is a bit of a shame that it
#' is such a pain to query.
#'
#' Tested in test_42ann_microbes.R
#' I am not 100% certain that this is giving me the full correct set of gene
#' ontology accessions. At the very least, it does return a large number of
#' them, which is a start.
#'
#' @param species Microbesonline species.
#' @param id Microbesonline taxon ID to query.
#' @param table_df Pre-existing data frame of annotations containing GO stuff.
#' @param id_column Column of the downloaded annotation table containing the
#'  gene IDs (this no longer uses MySQL).
#' @param data_column Column of the table from which to extract the GO data.
#' @param name Allowing for non-specific searches by species name.
#' @return data frame of GO terms from www.microbesonline.org
#' @seealso [tidyr]
#' @examples
#'  pa14_microbesonline_go <- load_microbesonline_go(species = "PA14")
#'  head(pa14_microbesonline_go)
#' @export
load_microbesonline_go <- function(id = NULL, species = NULL, table_df = NULL, id_column = "name",
                                   data_column = "GO", name = NULL) {
  if (is.null(id) & is.null(species)) {
    stop("This needs either a species or taxon id.")
  } else if (is.null(id)) {
    id <- get_microbesonline_taxid(species)
    if (is.null(id)) {
      stop("The species name did not find a taxon id.")
    } else if (length(id) > 1) {
      warning("There are more than 1 taxon id matches, arbitrarily choosing the first.")
      id <- id[1]
    }
  }
  chosen <- id
  if (is.null(table_df)) {
    table <- download_microbesonline_files(id = id, type = "tab")
    table_df <- sm(readr::read_tsv(file = table[["tab"]]))
  }
  if (! id_column %in% colnames(table_df)) {
    stop(id_column, " was not found in the table; the available columns are: ",
         toString(colnames(table_df)))
  }
  ## tidyr/tidyselect warn when a bare character variable is used to select a
  ## column; wrapping the column name in all_of() addresses that warning.
  go_df <- table_df[, c(id_column, data_column)] %>%
    tidyr::separate_rows(tidyr::all_of(data_column), sep = ",")
  keep_idx <- go_df[[data_column]] != ""
  go_df <- go_df[keep_idx, ]
  return(go_df)
}
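
## To illustrate the reshaping performed above: comma-separated GO strings
## become one row per gene/GO pair via tidyr::separate_rows().  A toy, offline
## example (the column names simply mirror the defaults used above), guarded
## so it never runs on source:
if (FALSE) {
  toy <- data.frame(name = c("geneA", "geneB"),
                    GO = c("GO:0003677,GO:0006355", "GO:0005524"),
                    stringsAsFactors = FALSE)
  tidyr::separate_rows(toy, tidyr::all_of("GO"), sep = ",")
  ## Result: three rows, geneA paired with GO:0003677 and GO:0006355, and
  ## geneB paired with GO:0005524.
}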

## EOF