R/parse_batches.R

Defines functions parse_batches

Documented in parse_batches

#' Read and parse a directory of docs of PubMed records and extract specified datatypes as csv's
#'
#' @param input_dir Path to a directory containing only .txt docs of batches of unparsed PubMed records, such as the output of \code{\link{fetch_batch}}.
#' @param pmids Vector of pmids. If pmids not user-provided, pmids will be saved as .rds.
#' @param datatypes Types of data to extract from xml for which there is a corresponding "pubmed_" function ("table", "abstract", "databanks", "authors", "mesh", "keywords", "pubtypes")
#' @param file_name Root for file names. Defaults to "pubmed".
#' @param suffix Suffix for file names. For example, record numbers. Defaults to NULL.
#' @param dir Directory for saving files (log file and pmids.rds, and extracted csv's, depending on \code{subdir}). Defaults to project root (\code{here::here()})
#' @param subdir Directory for saving extracted csv's. Defaults to \code{dir}.
#' @param quiet Whether to silence messages in console. Defaults to FALSE.
#' @param return Whether to return parsed xml. Defaults to FALSE since complete batches may be too large to hold in memory and interested in only side-effect csv's. If TRUE, returns list of length number of files in \code{input_dir} with each element containing a parsed xml.
#'
#' @details If both \code{file_name} and \code{suffix} are equal to their defaults, each doc in \code{input_dir} is checked against filename patterns of files generated by \code{\link{fetch_batch}} (either "YYYY-MM-DD_{FILENAME}_{SUFFIX}.txt" or "YYYY-MM-DD_{FILENAME}.txt"). Available values are extracted.
#'
#' @return Parsed xml with names = pmids. Also, side-effect of specified datatypes as csv's.
#' @export

# default return is FALSE

parse_batches <- function(input_dir,
                          pmids = NULL,
                          datatypes = c("table", "abstract",
                                        "databanks", "authors", "mesh",
                                        "keywords", "pubtypes"),
                          file_name = "pubmed",
                          suffix = NULL,
                          dir = here::here(),
                          subdir = dir,
                          quiet = FALSE,
                          return = FALSE) {

  # Nothing can be read if the input directory is missing
  if (!fs::dir_exists(input_dir)) {
    rlang::abort("Directory does not exist.")
  }

  # Every entry found in the input directory is treated as one doc to parse
  docs <- fs::dir_ls(input_dir)

  # A single entry whose extension is not "txt" rejects the whole directory
  if (any(fs::path_ext(docs) != "txt")) {
    rlang::abort("Directory must contain only .txt files.")
  }

  # Pick the iterator once: purrr::walk() calls parse_doc() for its side
  # effects only, whereas purrr::map() also collects what parse_doc() returns
  # for each doc into a list
  iterate <- if (!return) purrr::walk else purrr::map

  # All remaining arguments are forwarded unchanged to parse_doc() for each doc
  iterate(docs,
          parse_doc,
          pmids = pmids,
          datatypes = datatypes,
          file_name = file_name,
          suffix = suffix,
          dir = dir,
          subdir = subdir,
          quiet = quiet,
          return = return)
}
maia-sh/pubmedparser documentation built on Feb. 18, 2021, 11:44 a.m.