Tools for creating and applying dictionaries of value-replacement pairs, to clean non-valid values of numeric, categorical, or date-type variables within a dataset.
Install from GitHub with:
# install.packages("remotes")
remotes::install_github("epicentre-msf/dbc")
library(dbc)
data(ll1) # example messy dataset
data(dict_categ1) # example dictionary of categorical vars and allowed values
ll1
#> # A tibble: 7 × 10
#> id age age_unit sex status contacts date_onset date_admit date_exit exit_status
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 M143 14 Years M Suspected 22 43920 2020-04-01 2021.04.02 <NA>
#> 2 M345 8 months F ? ten 43924 April 2020 43940 SENT HOME
#> 3 M104 29 <NA> - confirmed 15 <NA> 03_04_2020 43932 Died
#> 4 M623 91 Year - Confirmed <NA> 2022-04-10 2020-04-12 Unknown Cure
#> 5 M685 ? Days F suspect Not sure <NA> <NA> 43918 <NA>
#> 6 M550 39.. Ans Homme Probable 31 43946 43951 43964 <NA>
#> 7 M190 66 Years M Not a case 17 24/04/2020 43952 43941 Sent home
dict_clean_numeric <- check_numeric(
ll1,
vars = c("age", "contacts"), # cols that should be numeric
fn = as.integer # values not coercible by `fn` are non-valid
)
dict_clean_numeric
#> # A tibble: 4 × 5
#> variable value replacement query new
#> <chr> <chr> <chr> <chr> <lgl>
#> 1 contacts ten <NA> Non-valid number TRUE
#> 2 age ? <NA> Non-valid number TRUE
#> 3 contacts Not sure <NA> Non-valid number TRUE
#> 4 age 39.. <NA> Non-valid number TRUE
Normally one would do this step in a spreadsheet but we’ll do it in R here for simplicity.
dict_clean_numeric$replacement <- c(".na", "39", "10", ".na")
clean_numeric(
ll1,
vars = c("age", "contacts"),
dict_clean = dict_clean_numeric,
fn = as.integer
)
#> # A tibble: 7 × 10
#> id age age_unit sex status contacts date_onset date_admit date_exit exit_status
#> <chr> <int> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
#> 1 M143 14 Years M Suspected 22 43920 2020-04-01 2021.04.02 <NA>
#> 2 M345 8 months F ? NA 43924 April 2020 43940 SENT HOME
#> 3 M104 29 <NA> - confirmed 15 <NA> 03_04_2020 43932 Died
#> 4 M623 91 Year - Confirmed NA 2022-04-10 2020-04-12 Unknown Cure
#> 5 M685 39 Days F suspect 10 <NA> <NA> 43918 <NA>
#> 6 M550 NA Ans Homme Probable 31 43946 43951 43964 <NA>
#> 7 M190 66 Years M Not a case 17 24/04/2020 43952 43941 Sent home
Check for new non-valid numeric values, after incorporating previous cleaning
dict_clean_numeric_update <- check_numeric(
ll2, # same as ll1 but with 3 additional entries
vars = c("age", "contacts"),
dict_clean = dict_clean_numeric, # incorporate previous cleaning before checking
fn = as.integer,
return_all = TRUE # return original cleaning dict + new entries
)
dict_clean_numeric_update
#> # A tibble: 5 × 5
#> variable value replacement query new
#> <chr> <chr> <chr> <chr> <lgl>
#> 1 contacts ten .na Non-valid number NA
#> 2 age ? 39 Non-valid number NA
#> 3 contacts Not sure 10 Non-valid number NA
#> 4 age 39.. .na Non-valid number NA
#> 5 age 6 years <NA> Non-valid number TRUE
Manually specify replacement for new non-valid entry
dict_clean_numeric_update$replacement[5] <- "6"
Apply updated cleaning dictionary to updated dataset
clean_numeric(
ll2,
vars = c("age", "contacts"),
dict_clean = dict_clean_numeric_update,
fn = as.integer
)
#> # A tibble: 10 × 10
#> id age age_unit sex status contacts date_onset date_admit date_exit exit_status
#> <chr> <int> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
#> 1 M143 14 Years M Suspected 22 43920 2020-04-01 2021.04.02 <NA>
#> 2 M345 8 months F ? NA 43924 April 2020 43940 SENT HOME
#> 3 M104 29 <NA> - confirmed 15 <NA> 03_04_2020 43932 Died
#> 4 M623 91 Year - Confirmed NA 2022-04-10 2020-04-12 Unknown Cure
#> 5 M685 39 Days F suspect 10 <NA> <NA> 43918 <NA>
#> 6 M550 NA Ans Homme Probable 31 43946 43951 43964 <NA>
#> 7 M190 66 Years M Not a case 17 24/04/2020 43952 43941 Sent home
#> 8 M443 10 Months F Confirmed 26 <NA> 43900 43926 Cured
#> 9 M206 6 Years f Conf. 7 43921 43923 ? dead
#> 10 M701 56 Years M Suspected 39 17-03-2020 20-03-2020 05-04-2020 <NA>
dict_clean_categ <- check_categorical(
ll1,
dict_allowed = dict_categ1 # dictionary of categorical vars and their allowed values
)
dict_clean_categ
#> # A tibble: 7 × 4
#> variable value replacement new
#> <chr> <chr> <chr> <lgl>
#> 1 age_unit year <NA> TRUE
#> 2 age_unit ans <NA> TRUE
#> 3 exit_status cure <NA> TRUE
#> 4 sex - <NA> TRUE
#> 5 sex homme <NA> TRUE
#> 6 status ? <NA> TRUE
#> 7 status suspect <NA> TRUE
Again, we would normally do this step in a spreadsheet but we do it in R here for simplicity.
dict_clean_categ$replacement <- c(
"Years",
"Years",
"Cured",
".na",
"M",
".na",
"Suspected"
)
clean_categorical(
ll1,
dict_allowed = dict_categ1,
dict_clean = dict_clean_categ
)
#> # A tibble: 7 × 10
#> id age age_unit sex status contacts date_onset date_admit date_exit exit_status
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 M143 14 Years M Suspected 22 43920 2020-04-01 2021.04.02 <NA>
#> 2 M345 8 Months F <NA> ten 43924 April 2020 43940 Sent home
#> 3 M104 29 <NA> <NA> Confirmed 15 <NA> 03_04_2020 43932 Died
#> 4 M623 91 Years <NA> Confirmed <NA> 2022-04-10 2020-04-12 Unknown Cured
#> 5 M685 ? Days F Suspected Not sure <NA> <NA> 43918 <NA>
#> 6 M550 39.. Years M Probable 31 43946 43951 43964 <NA>
#> 7 M190 66 Years M Not a case 17 24/04/2020 43952 43941 Sent home
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.