#' @title Pre-Process SPARQL Results Data
#'
#' @description Takes a data frame generated by one of the functions that
#'   source data from \href{http://statistics.gov.scot}{statistics.gov.scot}
#'   and returns a pre-processed data set with the following changes:
#'   \itemize{
#'     \item URI values are reduced to their last element (optional).
#'     \item Columns holding only SPARQL type markers (\code{uri},
#'       \code{integer}, \code{literal}, \code{Observation}) are dropped
#'       (optional).
#'     \item The \code{.value} part is removed from column names (optional).
#'     \item Columns whose values can all be parsed as numbers are converted
#'       to integer or numeric; commas are treated as thousands separators.
#'       This step is always applied.
#'   }
#'
#' @param x A data frame, usually obtained via
#'   \code{\link[SmarterScotland]{get_geography_data}} or other data sourcing
#'   function.
#' @param clean_URI_strings Defaults to \code{TRUE}; removes the initial part
#'   of an \code{http}/\code{https} URI, e.g.
#'   \code{http://purl.org/linked-data/cube#}, keeping only the text after
#'   the last \code{/} or \code{#}. As a side effect every column is
#'   converted to character before the numeric conversion step.
#' @param remove_cols Removes redundant columns, such as columns with value
#'   \code{uri} only. Defaults to \code{TRUE}.
#' @param clean_column_names Defaults to \code{TRUE}; applies sensible name
#'   cleaning to provided columns. For instance, column
#'   \code{unit_of_measure.value} will become \code{unit_of_measure}.
#'
#' @return A data frame. Columns that could not be converted to numbers
#'   without introducing missing values are returned unchanged by the
#'   numeric conversion step.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' pre_process_data(x = get_geography_data(data_set = "recorded-crime",
#'                                         geography = "Glasgow City",
#'                                         measure = "count"))
#' }
pre_process_data <-
  function(x,
           clean_URI_strings = TRUE,
           remove_cols = TRUE,
           clean_column_names = TRUE) {
    # Check that the provided object is a non-empty data frame
    assert_data_frame(
      x = x,
      all.missing = FALSE,
      min.rows = 1,
      min.cols = 1,
      null.ok = FALSE
    )

    # Keep only the last element of a URI. The pattern is anchored on the
    # http(s) scheme so that ordinary text that merely starts with "h" and
    # contains "/" or "#" (e.g. "housing/tenure") is left alone.
    if (clean_URI_strings) {
      x[] <- lapply(
        X = x,
        FUN = function(column) {
          sub(
            pattern = "^https?:.*[/#]",
            replacement = "",
            x = column,
            perl = TRUE
          )
        }
      )
    }

    # Remove pointless columns: those consisting solely of type markers
    if (remove_cols) {
      x <- Filter(
        f = function(column) {
          !all(column %in% c("uri", "integer", "literal", "Observation"))
        },
        x = x
      )
    }

    if (clean_column_names) {
      names(x) <- gsub(
        pattern = "\\.value",
        replacement = "",
        x = names(x)
      )
    }

    # Convert one column to integer/numeric when every non-missing value
    # parses as a number; otherwise hand the column back untouched.
    convert_if_numeric <- function(column) {
      present <- !is.na(column)
      if (!any(present)) {
        return(column)
      }
      # Commas are treated as thousands separators
      stripped <- gsub(
        pattern = ",",
        replacement = "",
        x = column,
        fixed = TRUE
      )
      parsed <- suppressWarnings(as.numeric(stripped))
      # Parsing must not introduce missing values. The original column is
      # returned (not the comma-stripped copy) so text is never altered.
      if (any(is.na(parsed) & present)) {
        return(column)
      }
      values <- parsed[present]
      has_decimal_point <- any(grepl(".", stripped[present], fixed = TRUE))
      # Downcast to integer only when that is lossless
      fits_integer <- all(values == trunc(values)) &&
        all(abs(values) <= .Machine$integer.max)
      if (!has_decimal_point && fits_integer) {
        as.integer(parsed)
      } else {
        parsed
      }
    }

    # Fix column types: a column is a candidate when at least one of its
    # values consists only of digits, commas, dots and minus signs. The
    # converter above makes the final decision.
    potential_numeric_columns <- which(vapply(
      X = x,
      FUN = function(column) {
        any(!grepl(
          pattern = "[^\\d\\,\\.\\-]",
          x = column,
          perl = TRUE
        ))
      },
      FUN.VALUE = logical(length = 1)
    ))

    if (length(potential_numeric_columns) > 0) {
      # List-style indexing keeps a data frame even for a single column;
      # x[, idx] would drop to a vector and break the column-wise lapply.
      x[potential_numeric_columns] <- lapply(
        X = x[potential_numeric_columns],
        FUN = convert_if_numeric
      )
    }

    x
  }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.