R/cfm-read.R

Defines functions cfm_read_batch cfm_predict_readfile cfm_predict_stdout

Documented in cfm_predict_readfile cfm_predict_stdout cfm_read_batch

#' Read cfm-predict standard output into R object
#'
#' @param rst Standard out from a completed system2 call to cfm-predict
#'
#' @return List containing predicted spectra (and annotations, if present)
#' @export
#'
cfm_predict_stdout <- function(rst){
  rst_break <- which(sapply(rst, nchar) == 0)
  spec <- rst[1:(rst_break-1)]
  spec_f <- cumsum(as.numeric(grepl('energy', spec)))
  spec_breaks <- !grepl('energy', spec) & spec != ""
  spec_split <- stats::setNames(
    split(spec[spec_breaks], spec_f[spec_breaks]),
    spec[grepl('energy', spec)])

  spec_out <- lapply(spec_split, function(x){
    do.call('rbind', lapply(x, function(xx) {
      y <- strsplit(xx, ' ')
      data.frame(matrix(
        c(y[[1]][1:2]),
        ncol = 2,
        byrow = T,
        dimnames = list(c(), c('mz', 'int'))
      ),
      stringsAsFactors = F)
    }))
  })

  if (rst_break == length(rst)) {
    out <- spec_out
  } else {
    spec_ann <- lapply(spec_split, function(x) {
      sapply(strsplit(x, ' '), function(xx) {
        paste0(xx[3:length(xx)], collapse = ' ')
      })
    })
    annotation <-
      data.frame(matrix(do.call('rbind', strsplit(rst[(rst_break + 1):length(rst)], ' ')),
                        ncol = 3,
                        dimnames = list(c(), c('idx', 'mz', 'smiles'))), stringsAsFactors = F)
    out <-
      list(
        spec = spec_out,
        spec_ann = spec_ann,
        annotation = annotation
      )
  }
  return(out)
}

#' Read cfm_predict results from output_filename
#'
#' @param output_filename A text file containing the output from cfm_predict.
#'
#' @return List containing predicted spectra (and annotations, if present)
#' @export
#'
cfm_predict_readfile <- function(output_filename) {
  rst <-
    utils::read.delim(
      file = output_filename,
      header = F,
      check.names = F,
      stringsAsFactors = F
    )[, 1]
  rst <- append(rst, "", after = which(grepl('^0', rst)) - 1)
  cfm_predict_stdout(rst = rst)
}


#' Read results of batch CFM prediction
#'
#' @param out_dir A path to a directory containing text files generated by
#'   \code{cfm_predict_batch}
#' @param table_name A name to be used for the resultant database table
#' @param output_path A path to a SQLite database file for writing the results
#'
#' @return Writes table to sql lite database file designated by output_path
#' @export
#'
cfm_read_batch <-
  function(out_dir = NULL,
           table_name = NULL,
           output_path = NULL) {
    stopifnot(exprs = {
      !is.null(out_dir)
      !is.null(table_name)
      !is.null(output_path)
    })
    tmp <-
      dplyr::tibble(filename = fs::dir_ls(out_dir, glob = '*.txt')) %>%
      dplyr::mutate(ID = gsub(".txt", "", x = basename(filename))) %>%
      dplyr::rowwise(.) %>%
      dplyr::mutate(spectrum = blob::as_blob(readBin(filename, 'raw', 1e6))) %>%
      dplyr::ungroup(.)
    if (!is.null(output_path)) {
      conn <- DBI::dbConnect(RSQLite::SQLite(), output_path)
      on.exit(DBI::dbDisconnect(conn))
      DBI::dbWriteTable(conn, table_name, tmp)
    } else{
      return(tmp)
    }
  }

#' Read CFM precomputed spectra from a database created by cfm_read_batch
#'
#' @param db_file [string] Full path to the database file
#' @param table_name [string] Name of table containing the CFM spectra to read
#' @param ID [string] A vector of ID(s) to retrieve from the database
#' @param return_annotation [logical] Should spectra be returned with annotations
#'
#' @return A named list of spectra when selected table contains a column named
#'   'spectrum'. Otherwise a tibble.
#' @export
#'
cfm_read_db <-
  function(db_file = NULL,
           table_name = NULL,
           ID = NULL,
           return_annotation = F) {
    db_file <- normalizePath(db_file, mustWork = T)
    stopifnot(!is.null(table_name))
    conn <- DBI::dbConnect(RSQLite::SQLite(), db_file)
    on.exit(DBI::dbDisconnect(conn))
    stopifnot(table_name %in% DBI::dbListTables(conn))
    if (!is.null(ID)) {
      id_match <- dplyr::tbl(conn, table_name) %>%
          dplyr::filter(ID %in% !!ID)

      if(dplyr::count(id_match) %>% dplyr::pull() == 0){
        cat("No predicted spectra found for ID(s):\n", ID)
        return(NULL)
      }
      a <- dplyr::as_tibble(id_match)
    } else {
      a <- dplyr::tbl(conn, table_name) %>%
        dplyr::as_tibble()
    }

    if ('spectrum' %in% colnames(a)) {
      b <- purrr::map(
        a$spectrum,
        cfm_parse_spec,
        return_annotation = return_annotation
        )
      out <- rlang::set_names(b, a$ID)
    } else {
      out <- a
    }
    return(out)
  }
gjgetzinger/cfmR documentation built on May 11, 2020, 1 p.m.