# Source file: start.R
#
# MIT License
#
# Copyright (c) 2022 Victor Ordu
## Functions for collecting and cleaning up data
globalVariables(c("orgtype", "orgname"))
#' Import Raw Data Into the Project
#'
#' @param dir A directory used for the downloads, which usually has
#' subdirectories - one for each project State.
#' @param modlist An optional list of objects of class \code{VarModifier}.
#' @param state The project state.
#' @param filetype The kind of data being imported. Should be one of
#' \emph{Capacity} or \emph{Services} for data on capacity assessment and for
#' GBV service mapping, respectively.
#' @param drop.c,drop.v Character or integer vector of columns or variables to
#' be dropped from the data table or list of variable names, respectively.
#' @param nvars The number of columns in the final data frame.
#' @param ... Arguments passed to \code{read_in_excel_data}.
#' @param na.strings Strings in the Excel sheet to be interpreted as \code{MA}.
#' @param db Path to an SQLite database. If non-existent, will attempt to
#' create one. If \code{NULL}, the data are returned without being saved.
#'
#' @return Both functions return a \code{data.frame} with labelled variables.
#' \code{import_data} does so invisibly.
#'
#' @note \code{import_data} wraps \code{read_in_excel_data}, as it carries
#' out additional processing of the data read from the spreadsheets.
#' \code{nvars} is usually set for the capacity assessment data, which tend
#' to have superfluous columns.
#'
#' @importFrom RSQLite dbConnect
#' @importFrom RSQLite dbDisconnect
#' @importFrom dplyr filter
#' @importFrom dplyr %>%
#'
#' @export
import_data <-
function(dir, modlist = NULL, state, filetype, ..., na.strings, db = NULL)
{
# Validate input
if (!is.null(modlist)) {
if (!is.vector(modlist, mode = "list"))
stop("'modlist' must be a vector of type 'list'")
}
state <- match.arg(state, getOption("jgbv.project.states"))
srv <- "Services"; cap <- "Capacity"
filetype <- match.arg(filetype, c(srv, cap))
if (missing(na.strings))
na.strings <- ""
## Collect data as well as modified variable names
## The vector with the modified names is supplied as
## a named vector via `options`, and the vector is named
## to make for easy indexing. The vector used varies depending
## on whether we are working with data on service mapping or
## for capacity assessment.
dat <- read_in_excel_data(dir, state, filetype, ..., na.strings = na.strings)
newvars <- .chooseNewVars(filetype)
## Make sure that only values from the state of interest
## are used. This is made necessary by the experience we
## had where data from 2 States were inadvertently joined
## together in a single Excel sheet. We don't want to have
## check this manually again. This applies only to the
## capacity assessment data and for the NFWP project!!!
if (filetype == cap && getOption("jgbv.project.name") == "NFWP") {
lgavarname <- paste0("lga.", tolower(state))
lgavar <- newvars[lgavarname]
dat <- dplyr::filter(dat, !is.na(.data[[lgavar]]))
}
# Fine-tune the variable labels
# First do the generic one that works the same regardless of the
# type of data we're dealing with, and then the one specific
# to each type of dataset.
var_label(dat) <- dat %>%
var_label(unlist = TRUE) %>%
unname %>%
.cleanLblGeneral() %>%
{
if (filetype == cap)
.cleanLblCapacity(.)
else if (filetype == srv)
.cleanLblServices(.)
}
## This block is only run for the data for service mapping
## Essentially, we are cleaning up selected columns.
## TODO: This operation may not be wholly necessary at this
## stage since we are saving to a database, which does not
## preserve all the base R types/classes used here.
if (filetype == srv) {
if (!is.null(modlist)) {
for (x in modlist)
dat <- dat %>%
.modifyAndPreserveLabels(newvars[x$vars], x$func, x$nestfunc, x$args)
}
dat <- transform_bool_to_logical(dat)
# In some of the State datasets, the number of staff/beneficiaries and costs
# are represented as character vectors, so we perform a check/fix for this.
for (col in grep("num_|_num|fee_", newvars, value = T)) {
if (is.character(dat[[col]]))
try(dat <- .modifyAndPreserveLabels(dat, col, as.numeric))
}
## Conversely, some of the columns that ought to contain strings are
## found to be numeric in some States' datasets
for (col in grep("describe|simserial", newvars, value = TRUE))
try(dat <- .modifyAndPreserveLabels(dat, col, as.character))
dat <- dat |>
.modifyAcrossPatterns("^(yes|no|sometimes|never|always)$", str_to_title) |>
.modifyAcrossPatterns("^free$", str_to_title) |>
fix_factors(newvars)
## Some of the LGAs in States are misspelt. We have prepared a function
## that is customized to fix this given the spelling mistakes identified
## and varies depending on the State
dat[[newvars['lga']]] <-
repair_lga_spelling(dat[[newvars['lga']]], state)
}
else if (filetype == cap) {
## In some cases, there are missing values where the type of
## organization ought to be specified. In this instance, we try
## to deduce what kind of organization it is by examining the name
## of the organization, which is extracted with regular expressions.
## When found, the type is identified and filled in.
rgxes <- c(health = "(hf|health|hospital|phc|dispensary)\\s?",
law = "law|nscdc")
is.missing <- sapply(rgxes, function(rgx) {
dat %>%
with(is.na(orgtype) &
grepl(rgx, orgname, ignore.case = TRUE)) %>%
which()
}, simplify = FALSE)
fac.string <-
sapply(
names(rgxes),
function(nm) {
unique(grep(nm, x = dat$orgtype, value = TRUE, ignore.case = TRUE))
},
USE.NAMES = TRUE
)
for (i in names(is.missing)) {
indic <- is.missing[[i]]
dat$orgtype[indic] <- fac.string[[i]]
}
}
if (!is.null(db))
save_table(dat, state, type = tolower(filetype), db)
invisible(dat)
}
.chooseNewVars <- function(filetype) {
stopifnot(is.character(filetype))
ft <- tolower(filetype)
if (ft == 'services')
return(new.varnames)
if(ft == 'capacity')
return(getOption('jgbv.capnames'))
stop("No avaiable action for file type ", sQuote(filetype))
}
#
# make_filename_regex <- function(services, capacity) {
# if (is.character(services) && is.character(capacity))
# return(c(Services = services, Capacity = capacity))
# stop("Both 'services' and 'capacity' must be character vectors")
# }
# The next set of modifications are based on the actual elements
# of the variable. They are identified with regular expressions.
# This function uses those patterns to carry out the modifications.
.modifyAcrossPatterns <- function(df, pattern, f) {
func <- substitute(f)
stopifnot({
is.data.frame(df)
is.character(pattern)
# is.function(eval(func))
})
patternFound <-
function(column) any(grepl(pattern, column))
indices <- which(vapply(df, patternFound, logical(1)))
if (!indices[1])
stop("The regular expression ",
sQuote(pattern),
"did not match column values")
for (i in indices)
try(df <- .modifyAndPreserveLabels(df, i, deparse(func)))
df
}
# Generic cleaning of variable labels
#' @importFrom stringr str_remove
#' @importFrom stringr str_replace
#' @importFrom stringr str_replace_all
#' @importFrom stringr str_trim
#' @importFrom stringr str_squish
.cleanLblGeneral <- function(labels) {
stopifnot(is.character(labels))
labels %>%
str_remove("^_+") %>%
str_replace_all("\\s/\\s", "/") %>%
str_replace("\\s/\\s", "/") %>%
str_remove("\\.{3}\\d+$") %>%
str_trim %>%
str_squish
}
#' @importFrom stringr str_remove
#' @importFrom stringr str_remove_all
#' @importFrom stringr str_replace
#' @importFrom stringr str_trim
#' @importFrom stringr str_squish
.cleanLblCapacity <- function(labels) {
stopifnot(is.character(labels))
labels %>%
str_replace("^(.+)(GPS coordinates)(.+_)(.+)$", "\\2 (\\4)") %>%
str_remove(regex("(Please)? (specify|state)", ignore_case = TRUE)) %>%
str_remove("^\\d{1,2}\\)") %>%
str_remove_all("Have you participated in any training on the provision of") %>%
str_remove("Handling of GBV case\\(s\\)") %>%
str_trim %>%
str_remove("\\,$") %>%
str_replace("^e", "E") %>%
str_squish
}
#' @importFrom stringr str_remove
#' @importFrom stringr str_replace
#' @importFrom stringr str_trim
#' @importFrom stringr str_squish
.cleanLblServices <- function(labels) {
stopifnot(is.character(labels))
labels %>%
str_replace("lgaorigin", "LGA") %>%
str_replace("stateorigin", "State") %>%
str_replace("\\(select all that apply\\)", "") %>%
str_remove("^Sources of funding/") %>%
str_replace("\\((please|to)?\\s?describe\\)", " ") %>%
str_replace("(Other )(\\((to )?specify\\))", "\\1") %>%
str_replace("The qualification/capacity of the staff", "Staff qualification/capacity") %>%
str_replace("(Nothing, facility is w)(ell equipped)", "W\\2") %>%
str_remove("\\((please )?specify\\)") %>%
str_remove("^Opening days/") %>%
str_remove("^Which forms of GBV does this facility address\\?/") %>%
str_remove("^Is the facility.+survivors of GBV?/") %>%
str_remove("^Please.+provided by this organization:/") %>%
str_remove("^Please select the specific forms of GBV.+ protocols.+survivors, such as :/") %>%
str_remove("^What services.+provide to survivors.+at this facility\\?/") %>%
str_remove(
"^Is the facility missing anything that is necessary to provide quality care to survivors of GBV\\?/"
) %>%
str_remove(
regex(
"^Are the following MEDICINES AND ESSENTIAL SUPPLIES\\s*available\\?/",
igore_case = TRUE
)
) %>%
str_remove("^Are the following elements available\\?/") %>%
str_remove("\\(funds for school fees\\, food\\, etc\\.\\)") %>%
str_remove("\\(Please specify the length of time survivors are allowed to stay\\)") %>%
str_remove("\\(funds for school fees\\, food\\, etc\\.\\)") %>%
str_remove("\\(referral to other structures\\)") %>%
str_remove("^Refer to") %>%
str_remove("\\(including completion of medical certificates or other medico-legal forms\\)") %>%
str_remove("\\(i\\.e\\. first aid kits\\, feminine hygiene products\\)") %>%
str_remove("\\(survivor-centered approaches and trauma-informed interview skills\\)") %>%
str_remove("\\(i\\.e\\. private or hidden from other rooms/areas\\)") %>%
str_remove("^Which type of standard health forms\\?/") %>%
str_remove("^How many cases were reported in the last 6 months for\\s") %>%
str_replace("^rape", "Rape") %>%
str_remove("\\(rape, sexual assault.*\\)$") %>%
str_remove("/domestic violence$") %>%
str_remove("\\(including physical.* or sexual abuse of children\\)") %>%
str_remove("^Does this facility.+trained and experienced in\\:?/") %>%
str_remove("^What resources are available for investigation and follow-up\\?/") %>%
str_remove(
"^What precautions are taken to ensure the safety of survivors of GBV and to protect their privacy\\?/"
) %>%
str_remove("^Does this organization offer the following amenities\\?/") %>%
str_remove("^What services are provided\\?/") %>%
str_remove("for temporary storage of forensic evidence$") %>%
str_remove(regex("^what is the price of the", ignore_case = TRUE)) %>%
str_remove("service in NGN\\?$") %>%
{
cap <- "MEDICINES AND ESSENTIAL SUPPLIES"
str_replace(., cap, str_to_lower(cap))
} %>%
str_trim %>%
str_squish
}
#' @rdname import_data
#'
#' @export
read_in_excel_data <-
function(dir,
state,
filetype,
drop.c = NULL,
drop.v = NULL,
nvars = NULL,
na.strings = "") {
if (!dir.exists(dir))
stop("The directory 'path' does not exist")
state <- match.arg(state, getOption("jgbv.project.states"))
filetype <- match.arg(filetype, c("Services", "Capacity"))
force(drop.c)
force(drop.v)
force(nvars)
# Select a pattern (if applicable for extracting the Excel file)
xl.rgx <- NULL
rgx <- getOption("jgbv.excelfile.regex")
if (!is.null(rgx))
xl.rgx <- rgx[filetype]
dir <- paste(dir, state, sep = '/')
xlf <- list.files(dir, xl.rgx, ignore.case = TRUE)
if (length(xlf) > 1L)
stop("More than one Excel file selected")
xlpath <- file.path(dir, xlf)
newcolnames <- .chooseNewVars(filetype)
df <-
.readRawAndLabel(xlpath, newcolnames, filetype, state, drop.c, drop.v, nvars, na = na.strings)
class(df) <- c(filetype, class(df))
df
}
# Reads in the raw data from Excel and also labels the new data frame
#' @importFrom dplyr select
#' @importFrom labelled var_label
#' @importFrom readxl read_xlsx
#' @importFrom purrr map_df
.readRawAndLabel <-
function(file,
new.var,
ftype,
state,
drop.col,
drop.var,
numvar,
na) {
stopifnot({
file.exists(file)
is.character(ftype)
})
df <- readxl::read_xlsx(file, na = na)
if (!is.null(drop.col))
for (col in drop.col)
df[[col]] <- NULL
if (!is.null(numvar))
df <- df[seq_len(numvar)]
labelled::var_label(df) <- names(df)
# For situations where there are missing variables from the
# imported data, we will add new variables that have only
# missing values.
if (!is.null(new.var)) {
missingVars <- isFALSE(is.null(drop.var))
if (missingVars) {
add.var <- new.var[drop.var]
new.var <- new.var[-drop.var]
}
names(df) <- new.var
if (!matchDfWithVarsLength(df, new.var))
stop("'new.var' must have as many elements as there are columns")
if (missingVars)
df[, add.var] <- NA
}
df
}
#' Cleanup Categorical Variables
#'
#' Converts relevant character vectors in the dataset into factors.
#'
#' @param data The data frame
#' @param newvars A named vector with the variable names.
#'
#' @return The modified data frame (if applicable).
#'
#' @export
fix_factors <- function(data, newvars) {
if (!is.data.frame(data))
stop("'data' should be of class 'data.frame'")
if (missing(newvars))
newvars <- new.varnames
if (!is.character(newvars))
stop("'newvars' should be atomic and of type 'character'")
data <-
.factorizeAndPreserveLabels(data, newvars[.yesNoVarnames()], c("Yes", "No"))
data <-
.factorizeAndPreserveLabels(data,
newvars[.yesNoVarnames("ynd")],
c("Yes", "No", "Don't know"))
data <- .factorizeAndPreserveLabels(data,
newvars[.yesNoVarnames("ynsd")],
c("Yes", "No", "Sometimes", "Don't know"))
.factorizeAndPreserveLabels(
data,
newvars["hf.type"],
c(
"Primary health care facility",
"Secondary health care facility",
"Tertiary health care facility",
"Other"
)
)
}
# Note that the value returned by this function is a vector of, not the
# the variable names themselves but the arbitrary names created as attributes
# or metadata to allow for subsequent variable names changes.
.yesNoVarnames <- function(type = c("yn", "ynd", "ynsd")) {
type <- match.arg(type)
switch(
type,
yn = c(
"serve.disabled",
"disabled.special",
"coc.copies",
"coc.confidentiality",
"coc.equity",
"has.focalperson",
"has.gbv.trained",
"has.refdir",
"choose.referral",
"coordination",
"support.for.court",
"police.followup",
"shelter.famfriendly",
"shelter.kidfriendly",
"shelter.support",
"shelter.new.support",
"child.docs"
),
ynd = c("standard.forms",
"data.is.stored",
"coc.signed"),
ynsd = c("computer.secured")
)
}
.factorizeAndPreserveLabels <- function(df, colnames, new.levels) {
stopifnot({
is.data.frame(df)
is.character(colnames)
is.character(new.levels)
})
for (fac in colnames) {
l <- var_label(df[[fac]])
df[[fac]] <- factor(df[[fac]], levels = new.levels)
var_label(df[[fac]]) <- l
}
df
}
#' Collect User Input for Data Importation
#'
#' Collects input from the user related to data for the
#' projects.
#'
#' @details Two variables are collected from the user: the state of interest
#' and the kind of data i.e. services or capacity assessment. The input can
#' be collected interactively or passed as a command line argument.
#'
#' @note When passing input via the shell or a script, \code{state} should
#' precede \code{filetype} in the argument list.
#'
#' @return A named list with 2 elements - one for each value collected.
#'
#' @importFrom tools toTitleCase
#'
#' @export
collect_input <- function() {
inputerr <- "Illegal input"
if (interactive()) {
state <- pick_one_state()
ftype <- c("Services", "Capacity")
ftypeIndex <- menu(ftype, TRUE, "Select one")
if (!ftypeIndex)
stop(inputerr)
filetype <- ftype[ftypeIndex]
}
else {
arglist <- commandArgs(trailingOnly = TRUE)
allstates <- getOption('jgbv.project.states')
res <- match(arglist, allstates)
stateIndex <- which(!is.na(res))
state <- arglist[stateIndex]
if (!(state %in% allstates))
stop(inputerr)
filetype <- arglist[1:2 != stateIndex] %>%
tools::toTitleCase()
}
list(state = state, filetype = filetype)
}
#' Repair Poorly Spelt Nigeria Local Government Areas in the Project
#'
#' A wrapper for \code{naijR::fix_region} for dealing with misspelt
#' Local Government Areas.
#'
#' @param x The character vector to be checked.
#' @param state The state whose LGAs will be checked.
#'
#' @importFrom naijR fix_region
#' @importFrom naijR fix_region_manual
#' @importFrom naijR is_lga
#' @importFrom naijR lgas
#'
#' @export
repair_lga_spelling <- function(x, state) {
stopifnot(state %in% getOption('jgbv.project.states'))
if (all(is_lga(x)))
return(x)
if (is.character(x))
x <- lgas(x, warn = FALSE)
x <- fix_region(x, quietly = TRUE)
if (state == "Taraba")
fix_region_manual(x, "Kurmi", "Kumi")
else if (state == "Niger") {
moya <- "Moya"
x %>%
fix_region_manual("Muya", moya) %>%
fix_region_manual("Muye", moya)
}
else x
}
#' Change Binary Values to Logical
#'
#' Set the Columns that have 1/0 values to T/F instead, notably those
#' for multiple response variables
#'
#' @param data The data frame
#'
#' @return The data frame, now modified (if applicable).
#'
#' @export
transform_bool_to_logical <- function(data) {
if (!is.data.frame(data))
stop("'data' must be an object of class 'data.frame'")
for (k in getOption("jgbv.multiresponse.regex")) {
colnams <- grep(k, names(data), value = TRUE)
for (col in colnams) {
if (is.character(data[[col]]))
data <- .modifyAndPreserveLabels(data, col, as.integer)
}
data <- .modifyAndPreserveLabels(data, colnams, as.logical)
}
data
}
# We need this function when modifying the variables because some
# of the functions in use do strip the labels
#' @importFrom labelled var_label
.modifyAndPreserveLabels <- function(df, x, f, nest = NULL, ...) {
stopifnot(is.data.frame(df))
local({
f <- if (is.character(f))
as.name(f)
else
substitute(f)
arglist <- list()
columnnames <- names(df)
for (i in x) {
if (is.character(i) && (!i %in% columnnames)) {
message("No variable named ", i)
next
}
l <- var_label(df[[i]])
cc <- as.call(list(f, quote(df[[i]])))
cl <- c(as.list(cc), ...)
cc <- as.call(cl)
if (!is.null(nest)) {
nestfun <- if (is.character(nest))
as.name(nest)
else
substitute(nest)
cc <- as.call(list(nestfun, cc))
}
if (is.numeric(i))
i <- columnnames[i]
message("Modifying ", i)
df[[i]] <<- eval(cc)
var_label(df[[i]]) <<- l
}
})
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.