R/augmentations.R

Defines functions parse_ranges jst_unify_journal_id jst_get_total_pages jst_add_total_pages jst_clean_page jst_augment

Documented in jst_add_total_pages jst_augment jst_clean_page jst_get_total_pages jst_unify_journal_id

#' Clean data from DfR
#' 
#' This function takes data from [jst_get_article()] and
#' applies helper functions for cleaning the data.
#' 
#' Data from DfR is inherently messy. For many examples see 
#' `vignette("known-quirks", package = "jstor")`. `jst_augment()` is a
#' convenience function that tries to deal with a few common tasks to
#' clean the data.
#' 
#' For journal articles, it calls [jst_clean_page()] to convert first and last
#' page, [jst_unify_journal_id()] and [jst_add_total_pages()].
#' 
#' @param meta_data Data which was processed via [jst_get_article()].
#' @param quietly Should warnings from converting page ranges be suppressed?
#' 
#' @return A cleaned tibble.
#' @seealso [jst_clean_page()] [jst_unify_journal_id()] [jst_add_total_pages()]
#' [jst_get_total_pages()]
#' 
#' @export
jst_augment <- function(meta_data, quietly = FALSE) {
  col_names <- names(meta_data)
  
  if (identical(col_names, names(article_cols$cols)) ||
      identical(col_names, names(article_cols_old$cols))) {
    # for journal articles
    meta_data %>%
      dplyr::mutate_at(dplyr::vars("first_page", "last_page"),
                       jst_clean_page) %>% 
      jst_unify_journal_id() %>%
      jst_add_total_pages(quietly = quietly)
  } else {
    abort("Unknown meta_data type.")
  }
}

#' Clean a character vector of pages
#' 
#' This function tries to convert character vectors into integers. This function
#' should not be called on page ranges.
#' 
#' @param page A character vector for pages.
#' @return An integer vector, cleaned and converted from the input vector.
#' 
#' 
#' @export
#' @examples 
#' jst_clean_page("2")
#' 
#' # anything that is not a digit gets removed
#' jst_clean_page("A2-")
#' 
#' # a weird format from the American Journal of Sociology is convered correctly
#' jst_clean_page("AJSv104p126")
#' # this is done by searching for "p", and if it is found, extracting the
#' # content after "p".
jst_clean_page <- function(page) {
  # check if any has weird format like with AJS: AJSv104p126
  is_complicated <- str_detect(page, "p")
  
  # do not change missing values
  is_complicated[is.na(is_complicated)] <- FALSE
  
  page[is_complicated] <- stringr::str_extract(page[is_complicated], 
                                               "(?<=p)\\d+")
  
  convert_page(page)
}

#' Add total count of pages
#' 
#' This function adds a column with the total count of pages. It calls 
#' [jst_get_total_pages()] which does the main work.
#' 
#' @param meta_data Data which was processed via [jst_get_article()].
#' @param quietly Should warnings from converting page ranges be suppressed?
#' 
#' @return A `tibble`, as provided with in `meta_data`, with an additional 
#' column on total number of pages.
#' 
#' @export
#' @seealso [jst_get_total_pages()]
jst_add_total_pages <- function(meta_data,
                                quietly = FALSE) {

  dplyr::mutate(
    meta_data,
    n_pages = jst_get_total_pages(first_page, last_page, page_range,
                                      quietly)
  )
}

#' Calculate total pages 
#' 
#' This function is a simple helper to calculate the total number of pages of
#' an article.
#' 
#' This function deals with four cases:
#' 
#' - if all three arguments are missing, NA is returned.
#' - if page_range is supplied, the number of pages is calculated from it.
#' - if only the first page is supplied, NA is returned.
#' - if first and last page are supplied, the number of pages is calculated as
#' `last_page - first_page + 1`.
#' 
#' The algorithm to parse page ranges works as follows: A typical page range is
#' `1-10, 200` where the article starts at page 1, ends at page 10, and has an
#' erratum at page 200. For this case, the range is calculated as
#' `range + single_page`, as in`(10 - 1 + 1) + 1 = 11`. Sometimes multiple 
#' ranges are given: `1-10, 11-20`. For those cases all ranges are summed:
#' `(10 - 1 + 1) + (20 - 11 + 1) = 20`. Another specification for multiple 
#' ranges is `1-10+11-20`, which is treated similarly.
#' 
#' 
#' @param first_page The first page of an article (numeric).
#' @param last_page The last page of an article (numeric).
#' @param page_range The page range of an article (character).
#' @param quietly Sometimes page ranges contain roman numerals like `xiv`. These
#' are not recognized, return `NA` and raise a warning. If set to `TRUE`, this
#' warning not raised.
#' 
#' @return A vector with the calculated total pages.
#' 
#' @export
#' @examples 
#' # calculate pages from first and last page
#' first_pages <- sample(30:50, 10)
#' last_pages <- first_pages + sample(5:20, 10)
#' page_ranges <- rep(NA_character_, 10)
#' 
#' jst_get_total_pages(first_pages, last_pages, page_ranges)
#'
#' # get pages from page range
#' jst_get_total_pages(NA_real_, NA_real_, "51 - 70")
#' jst_get_total_pages(NA_real_, NA_real_, "51 - 70, 350")
#' jst_get_total_pages(NA_real_, NA_real_, "350, 51 - 70")
#' jst_get_total_pages(NA_real_, NA_real_, "51 - 70, 80-100")
#' jst_get_total_pages(NA_real_, NA_real_, "51-70+350")
jst_get_total_pages <- function(first_page, last_page, page_range,
                                quietly = FALSE) {
  if (!(is.numeric(first_page) && is.numeric(last_page))) 
    abort("`first_page` and `last_page` must be numeric.")
  
  if (!is.character(page_range))
    abort("`page_range` must be a character vector.")
  
  ll <- list(first_page, last_page, page_range)
  if (!identical(length(unique(lengths(ll))), 1L)) {
    abort("All inputs must have the same length.")
  }
  
  dplyr::case_when(
    is.na(first_page) & is.na(last_page) & is.na(page_range) ~ NA_real_,
    !is.na(page_range) ~ parse_ranges(page_range, quietly),
    is.na(page_range) & !is.na(first_page) & is.na(last_page) ~ NA_real_,
    is.na(page_range) & 
      !is.na(first_page) & 
        !is.na(last_page) ~ as.numeric(last_page) - as.numeric(first_page) + 1
  )
}




#' Unify journal IDs
#' 
#' This function is a simple wrapper to unify journal ids.
#' 
#' Date on journal ids can be found in three columns:
#' `journal_pub_id`, `journal_jcode` and `journal_doi`. From my experience,
#' most of the time the relevant dat ais present in `journal_pub_id` or
#' `journal_jcode`, with `journal_jcode` being to most common identifier.
#' This function takes the value from `journal_pub_id`, and if it is missing,
#' that from `journal_jcode`. `journal_doi` is currently disregarded.
#' 
#' @param meta_data Data which was processed via [jst_get_article()].
#' @param remove_cols Should the original columns be removed after unifying?
#' 
#' @return A modified `tibble`.
#' 
#' @return A modified tibble.
#' @export
#' 
#' @examples 
#' article <- jst_get_article(jst_example("article_with_references.xml"))
#' 
#' jst_unify_journal_id(article)
#' 
#' 
#' # per default, original columns with data on the journal are removed
#' library(dplyr)
#' 
#' jst_unify_journal_id(article) %>% 
#'   select(contains("journal")) %>% 
#'   names()
#'   
#' # you can keep them by setting `remove_cols` to `FALSE`
#' jst_unify_journal_id(article, remove_cols = FALSE) %>%  
#'   select(contains("journal")) %>%
#'   names()
jst_unify_journal_id <- function(meta_data,
                                 remove_cols = TRUE) {
  out <- meta_data %>%
    dplyr::mutate(journal_id = dplyr::case_when(
      is.na(journal_pub_id) ~ journal_jcode,
      TRUE ~ journal_pub_id
    ))
  
  if (remove_cols) {
    out <- dplyr::select(out, -journal_pub_id, -journal_jcode, -journal_doi)
  } 
  
  out
}








parse_ranges <- function(page_range, quietly = FALSE) {
  nrow <- validate_tibble(page_range)
  
  splitted_df <- tibble::new_tibble(list(page_range = page_range), 
                                    nrow = nrow) %>% 
    mutate(id = 1:n(),
           splitted = stringr::str_split(page_range, ",|\\+"))
  
  if (tidyr_new_interface()) {
    splitted_df <- splitted_df %>% 
      tidyr::unnest(cols = c("splitted"))
  } else {
    splitted_df <- splitted_df %>% 
      tidyr::unnest()
  }
  
  # detect roman numerals which are occasionally used for introduction sections
  roman_chars <- str_detect(splitted_df$splitted, "x|i|v|X|I|V")
  
  if (any(roman_chars, na.rm = TRUE) && !quietly) {
    warning("Cannot handle roman numerals (`x|i|v|X|I|V`) in rows (",
            paste(which(roman_chars), collapse = ", "),
            ") when computing ",
            "page range. ",
            "Returning `NA_character` instead.\n", call. = FALSE)
  }
  
  splitted_df %>% 
    mutate(wo_space = stringr::str_remove(splitted, "\\s"),
           wo_chars = stringr::str_remove_all(wo_space, "[:alpha:]"),
           first = stringr::str_extract(wo_chars, "^\\d+") %>% as.integer(),
           last = stringr::str_extract(wo_chars, "\\d+$") %>% as.integer(),
           total = last - first + 1) %>% 
    dplyr::group_by(id) %>% 
    dplyr::summarise(sum = sum(total)) %>% 
    dplyr::pull(sum)
}
tklebel/jstor documentation built on July 31, 2023, 1:35 p.m.