R/parse_doc.R

Defines functions parse_doc

Documented in parse_doc

#' Read and parse doc of PubMed records and extract specified datatypes as csv's
#'
#' Reads the file at \code{doc} and passes its contents, together with the remaining arguments, to \code{pubmedparser::parse_batch()}.
#'
#' @param doc Filepath of doc of batch of unparsed PubMed records, such as the output of \code{\link{fetch_batch}}.
#' @param pmids Vector of pmids. If pmids not user-provided, pmids will be saved as .rds.
#' @param datatypes Types of data to extract from xml for which there is a corresponding "pubmed_" function ("table", "abstract", "databanks", "authors", "mesh", "keywords", "pubtypes")
#' @param file_name Root for file names. Defaults to "pubmed".
#' @param suffix Suffix for file names. For example, record numbers. Defaults to NULL.
#' @param dir Directory for saving files (log file and pmids.rds, and extracted csv's, depending on \code{subdir}). Defaults to project root (\code{here::here()})
#' @param subdir Directory for saving extracted csv's. Defaults to \code{dir}.
#' @param quiet Whether to silence messages in console. Defaults to FALSE.
#' @param return Whether to return parsed xml. Defaults to TRUE. Set to FALSE if interested in only side-effect csv's.
#'
#' @details If \code{file_name} and \code{suffix} are both equal to their defaults, the file name of \code{doc} (without its directory) is checked against filename patterns of files generated by \code{\link{fetch_batch}} (either "YYYY-MM-DD_{FILENAME}_{SUFFIX}.txt" or "YYYY-MM-DD_{FILENAME}.txt"). Available values are extracted and used in place of the defaults. If neither pattern matches, or if either argument is user-provided, \code{file_name} and \code{suffix} are passed on unchanged.
#'
#' @return The value returned by \code{pubmedparser::parse_batch()}: parsed xml with names = pmids. Also, side-effect of specified datatypes as csv's.
#' @export

parse_doc <- function(doc,
                      pmids = NULL,
                      datatypes = c("table", "abstract",
                                    "databanks", "authors", "mesh",
                                    "keywords", "pubtypes"),
                      file_name = "pubmed",
                      suffix = NULL,
                      dir = here::here(),
                      subdir = dir,
                      quiet = FALSE,
                      return = TRUE){

  # Name of the file being parsed, without its directory. Always assigned so
  # the console message below never falls through to the base function `file()`.
  # basename() also copes with bare file names and non-".txt" files.
  # `doc` is only inspected when it is a single string; anything else is left
  # for readr::read_file() to accept or reject.
  file <- if (is.character(doc) && length(doc) == 1L) {
    basename(doc)
  } else {
    NA_character_
  }

  # If default file_name and suffix, check for pattern matches.
  # Patterns are matched against the bare file name so that underscores or
  # dates in directory names cannot leak into file_name/suffix.
  if (identical(file_name, "pubmed") && is.null(suffix) && !is.na(file)){

    # If YYYY-MM-DD_{FILENAME}_{SUFFIX}.txt
    if (stringr::str_detect(file, "\\d{4}-\\d{2}-\\d{2}_[^_]+_[^_]+\\.txt$")){

      file_name <- stringr::str_extract(file, "(?<=\\d{4}-\\d{2}-\\d{2}_)[^_]+(?=_)")
      suffix <- stringr::str_extract(file, "(?<=_)[^_]+(?=\\.txt$)")

      # If YYYY-MM-DD_{FILENAME}.txt
    } else if (stringr::str_detect(file, "\\d{4}-\\d{2}-\\d{2}_[^_]+\\.txt$")) {

      file_name <- stringr::str_extract(file, "(?<=_)[^_]+(?=\\.txt$)")

    }
    # Otherwise keep the defaults (file_name = "pubmed", suffix = NULL)
  }

  # Inform user of file_name and suffix
  # (a NULL suffix is dropped by paste(), leaving "suffix:" with no value)
  if (!quiet){
    rlang::inform(message = paste("Parsing", file,
                                  "\nfile_name:", file_name,
                                  "\nsuffix:", suffix)
    )
  }

  # Replace the path with the file's contents before handing off to the parser
  doc <- readr::read_file(doc)

  # The parser's value is this function's return value (last expression)
  pubmedparser::parse_batch(doc,
                            pmids = pmids,
                            datatypes = datatypes,
                            file_name = file_name,
                            suffix = suffix,
                            dir = dir,
                            subdir = subdir,
                            quiet = quiet,
                            return = return)

  # TODO: fix NULL return, currently returns input (doc/filepath)
  # if (return) return(batch) else return(NULL)
}
maia-sh/pubmedparser documentation built on Feb. 18, 2021, 11:44 a.m.