# R/pre_process_data.R
#
# Defines functions: pre_process_data
#
# Documented in: pre_process_data

#' @title Pre-Process SPARQL Results Data
#'
#' @description Takes a data frame generated by one of the functions that
#'   source data from \href{http://statistics.gov.scot}{statistics.gov.scot}
#'   and returns a pre-processed data set. Depending on the arguments, URI
#'   prefixes are stripped from values, columns holding only SPARQL metadata
#'   are dropped and the \code{.value} suffix is removed from column names.
#'   Finally, columns whose values can all be read as numbers are converted to
#'   integer or numeric.
#'
#' @details Number conversion is attempted for every column with at least one
#'   non-missing value made only of digits, commas, dots and minus signs.
#'   Commas are treated as thousands separators. A column is converted only
#'   when every non-missing value parses as a number; otherwise it is returned
#'   untouched. Missing values stay missing. Whole numbers without a decimal
#'   point that fit the integer range become integer, everything else numeric.
#'
#' @param x A data frame, usually obtained via
#'   \code{\link[SmarterScotland]{get_geography_data}} or other data sourcing
#'   function.
#' @param clean_URI_strings Defaults to \code{TRUE}; removes the initial part
#'   of a URI, e.g. \code{http://purl.org/linked-data/cube#}, from values that
#'   start with \code{http://} or \code{https://}, keeping only what follows
#'   the last \code{/} or \code{#}.
#' @param remove_cols Removes redundant columns, such as columns with value
#'   \code{uri} only. Defaults to \code{TRUE}.
#' @param clean_column_names Defaults to \code{TRUE}; applies sensible name
#'   cleaning to provided columns. For instance, column
#'   \code{unit_of_measure.value} will become \code{unit_of_measure}.
#'
#' @return A data frame.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' pre_process_data(x = get_geography_data(data_set = "recorded-crime",
#'                                         geography = "Glasgow City",
#'                                         measure = "count"))
#' }
pre_process_data <-
  function(x,
           clean_URI_strings = TRUE,
           remove_cols = TRUE,
           clean_column_names = TRUE) {
    # Check if provided object is a usable data frame
    assert_data_frame(
      x = x,
      all.missing = FALSE,
      min.rows = 1,
      min.cols = 1,
      null.ok = FALSE
    )

    # Keep only the last element of a URI. The pattern is anchored on the
    # http(s) scheme so ordinary text that merely starts with "h" and contains
    # "/" or "#" is left alone.
    if (clean_URI_strings) {
      x[] <- lapply(
        X = x,
        FUN = function(column) {
          sub(
            pattern = "^https?://.*[/#]",
            replacement = "",
            x = column,
            perl = TRUE
          )
        }
      )
    }

    # Remove columns that consist solely of SPARQL metadata markers
    if (remove_cols) {
      metadata_values <- c("uri", "integer", "literal", "Observation")
      keep <- vapply(
        X = x,
        FUN = function(column) {
          !all(column %in% metadata_values)
        },
        FUN.VALUE = logical(length = 1)
      )
      x <- x[keep]
    }

    if (clean_column_names) {
      names(x) <- gsub(
        pattern = "\\.value",
        replacement = "",
        x = names(x)
      )
    }

    # TRUE when at least one non-missing, non-empty value is made only of
    # digits, commas, dots and minus signs. Missing and empty values are
    # ignored so they cannot flag a text column on their own.
    looks_numeric <- function(column) {
      values <- as.character(column)
      values <- values[!is.na(values) & nzchar(values)]
      any(!grepl(
        pattern = "[^\\d\\,\\.\\-]",
        x = values,
        perl = TRUE
      ))
    }

    # Convert a column to integer or numeric; return it untouched when any
    # non-missing value fails to parse.
    to_number <- function(column) {
      stripped <- gsub(
        pattern = ",",
        replacement = "",
        x = column,
        fixed = TRUE
      )
      parsed <- suppressWarnings(as.numeric(stripped))

      # Only NAs introduced by parsing count as a failure; values that were
      # already missing do not. On failure hand back the original column, not
      # the comma-stripped copy, so text is not altered.
      if (any(is.na(parsed) & !is.na(column))) {
        return(column)
      }

      has_decimal <- any(grepl(pattern = ".", x = stripped, fixed = TRUE))
      is_whole <- all(parsed == trunc(parsed), na.rm = TRUE)
      fits_integer <- all(abs(parsed) <= .Machine$integer.max, na.rm = TRUE)

      if (!has_decimal && is_whole && fits_integer) {
        as.integer(parsed)
      } else {
        parsed
      }
    }

    # Fix column types
    numeric_candidates <- unname(which(vapply(
      X = x,
      FUN = looks_numeric,
      FUN.VALUE = logical(length = 1)
    )))

    if (length(numeric_candidates) > 0) {
      # List-style indexing never drops to a vector, so a single candidate
      # column is still processed as a whole column.
      x[numeric_candidates] <- lapply(
        X = x[numeric_candidates],
        FUN = to_number
      )
    }

    x
  }
# konradedgar/SmarterScotland documentation built on Oct. 3, 2019, 11:46 a.m.