# Data engineering related helper functions
# Get data ---------------------------------------------------------------
#' Return different format of the final aggregate results country summary
#'
#' `get.CME.UI.data` can read in "Rates & Deaths_Country Summary.csv" for any
#' indicators published so far and include Sex, Quantile in the output. If there
#' is only median (i.e. no Quantile column in the dataset) the function will
#' check if there is only one row per country. An example list of summary.csv
#' files directories could be obtained by \code{\link{load.final_dir}}. Choose
#' the `format` of the output dataset from `long`, `wide_q`(wide quantile),
#' `wide_year`, `wide_ind` (wide indicator) and `wide_get` (one column for rate
#' and one column for death)
#'
#' @param dir_file directory to the dataset to be read: directory to aggregate
#' final
#' @param c_iso country ISO3Code, default to NULL: returns all countries in the
#' dataset
#' @param year_range a vector of years, default to NULL: use all available years
#' @param idvars default to "`OfficialName`, `ISO3Code`", what id vars you want
#' to include
#' @param format Choose format among raw, long, wide_year, wide_ind, and
#' wide_get, default to "long". All the wide-format just `dcasts` the
#' long-format data, wide_get means two columns: rate and death
#' @param get default to "rate". Choose among "rate", "death", or "both"
#' @param round_digit digits to round estimates, default to `NULL`
#' @param quantile default to TRUE: return upper, median, lower; FALSE: only
#' median.
#' @param sex default as NULL, will determine sex from directory of `dir_file`
#' unless supplied
#'
#' @return a data.table (data.frame)
#' @export get.CME.UI.data
#'
#'
get.CME.UI.data <- function(
dir_file = NULL,
c_iso = NULL,
year_range = NULL,
get = "both", # "rate" / "death" / "both"
idvars = c("OfficialName", "ISO3Code"),
sex = NULL,
format = "long", # "raw"/ "long"/ "wide_year"/ "wide_ind"/ "wide_get"
round_digit = NULL,
quantile = TRUE
){
# load file
if(is.null(dir_file)) {
stop("Supply `dir_file`: directory to 'Country Summary.csv'")
} else {
dir_file <- unlist(dir_file)
if(file.exists(unlist(dir_file))){
dt <- data.table::fread(dir_file)
message("Data loaded: ", dir_file)
} else {
stop("Check: file doesn't exist: ", dir_file)
}
}
# cleaning up col names
dt <- dt[ISO3Code!="LIE"]
setnames(dt, gsub(" ", ".", colnames(dt)))
setnames(dt, gsub("-", ".", colnames(dt)))
# find the Quantile column:
if("Quintile"%in%colnames(dt))setnames(dt, "Quintile", "Quantile")
if("X"%in%colnames(dt))setnames(dt, "X", "Quantile")
# in case leave as blank, column will have names like V99, V101, etc
if(!"Quantile"%in%colnames(dt)){
columnV <- grep("V", colnames(dt), value = TRUE)
columnV <- columnV[which(nchar(columnV) %in% c(3,4))]
message("Assign this column as Quantile column: ", columnV[1])
setnames(dt, columnV[1], "Quantile")
}
# `row_per_iso` should be either 1 or 3, so `length(row_per_iso)` should be 1
row_per_iso <- unique(dt[,.N, by = ISO3Code][,N])
if(length(row_per_iso)!=1) warning("Should be one row each country? Check row per iso: ", paste(row_per_iso, collapse = ", "))
# Does the quantile column exist?
# If there is no Quantile column, then I assume there is only Median.
# There should one row per country
if(!"Quantile"%in%colnames(dt)){
# If there is no quantile column it means there should be median only
if(row_per_iso==1){
dt[, Quantile:= "Median"]
} else {
stop("There should be a `Quantile` column in the final results file?")
}
}
# find all available indicators
# get all the variables available in the datasets:
# e.g. c("X5q15", "X10q15", "X5q20")
available_inds <- grep(".2018", colnames(dt), value = TRUE, fixed = TRUE)
available_inds <- gsub(".2018", "", available_inds)
if(get == "rate"){
inds_wanted <- available_inds[!grepl("death|Death", available_inds)]
} else if (get == "death"){
inds_wanted <- available_inds[grepl("death|Death", available_inds)]
} else {
inds_wanted <- available_inds # otherwise just take all available inds
}
# find all available years
# available years
available_years <- grep(inds_wanted[1], colnames(dt), value = TRUE, fixed = TRUE)
available_years <- as.numeric(gsub(paste0(inds_wanted[1], "."), "", available_years))
if(!is.null(year_range)){
# so it is OK to supply years by mistake like year_range = 2000.4, match by
# flooring
year_range <- available_years[floor(available_years)%in%floor(as.numeric(year_range))]
if(length(year_range)==0){
message("Supplied `year_range` is not in available years.\n",
"Available years are between ", paste(range(available_years), collapse = " and "),
" --- will use all available years")
year_range <- available_years
}
} else {
message("`year_range` set to NULL: use all available years in the dataset: ",
paste(range(available_years), collapse = "-"))
year_range <- available_years
}
# subset isos if what's required is available
isos <- dt[, unique(ISO3Code)]
isos <- isos[isos!=""]
if(!is.null(c_iso)) {
c_iso <- c_iso[c_iso%in%isos]
} else {
c_iso <- isos
}
# determine sex from dir
if(is.null(dir_file)){
sex <- "Total"
} else {
if(is.null(sex)){
if(grepl("female", dir_file, ignore.case = TRUE)){
sex <- "Female"
} else if (grepl("male", dir_file, ignore.case = TRUE)) {
sex <- "Male"
} else {
sex <- "Total"
}
}
}
dt[, Sex:= sex]
if(!format%in%c("raw", "long", "wide_q", "wide_year", "wide_ind", "wide_get")){
message("Choose format among raw, long, wide_q, wide_get, wide_year and wide_ind, default to long")
format <- "long"
}
# the value variable
inds_wanted_year <- sort(do.call(paste, c(expand.grid(inds_wanted, year_range), sep = ".")))
dt[, (inds_wanted_year):= lapply(.SD, as.numeric), .SDcols = inds_wanted_year]
#
vars_wanted <- c(idvars, "Quantile", "Sex", inds_wanted_year)
dt1 <- dt[ISO3Code %in% c_iso, ..vars_wanted]
# quantile
if(!is.logical(quantile)){
quantile <- TRUE
message("quantile default to TRUE, choose between TRUE or FALSE.")
}
if(!quantile) dt1 <- dt1[Quantile=="Median"]
if(format == "raw") {
return(dt1)
} else {
dt_long <- data.table::melt(dt1, measure.vars = inds_wanted_year, value.name = "value", variable.factor = FALSE)
# dt_long[, Shortind:= gsub( "\\..*", "", variable )] # remove anything after .
dt_long[, Shortind:= substr(variable, 1, nchar(variable)-5)] # remove anything after .
# Deaths -> deaths
dt_long[, Shortind:= gsub("Deaths", "deaths", Shortind)]
message("Shortind: ", paste(dt_long[, unique(Shortind)], collapse = ", "))
dt_long[, Year:= gsub( ".*\\.", "", variable )] # remove anything after the last dot
dt_long[, Year:= as.numeric(Year)]
vars_kept <- c(idvars, "Shortind", "Year", "Quantile", "Sex", "value")
dt_long <- dt_long[, ..vars_kept]
data.table::setkey(dt_long, ISO3Code)
# round?
if(is.numeric(round_digit)) dt_long[, value:= roundoff(value, digits = round_digit)]
# remove NA
dt_long <- dt_long[!is.na(value)]
# can choose output format
# since everything is made from format long
if(format == "long"){
return(dt_long)
# wide all the indicators (incl. rates, deaths, if all selected)
} else if (format == "wide_ind") {
# e.g. OfficialName ISO3Code X Year U5MR NMR IMR
formula0 <- paste(paste(idvars, collapse = " + "), " + Sex + Year + Quantile ~ Shortind" )
dt_wide_ind <- data.table::dcast.data.table(dt_long, formula = formula0, value.var = "value")
return(dt_wide_ind)
# wide all the years, all the inds in one column
} else if (format == "wide_year") {
formula0 <- paste(paste(idvars, collapse = " + "), " + Shortind + Sex + Quantile ~ Year" )
dt_wide_year <- data.table::dcast.data.table(dt_long, formula = formula0, value.var = "value")
return(dt_wide_year)
} else if (format == "wide_q") {
formula0 <- paste(paste(idvars, collapse = " + "), "+ Shortind + Sex + Year ~ Quantile" )
dt_wide_quantile <- data.table::dcast.data.table(dt_long, formula = formula0, value.var = "value")
return(dt_wide_quantile)
} else {
# Two value columns : one for rate, one for deaths
dt_long[, Get:="Rate"]
dt_long[grepl("deaths", Shortind, ignore.case = TRUE), Get:="Deaths"]
if(data.table::uniqueN(dt_long$Get)==1){
"You only selected Rate or Death, not both, so long-format output is returned"
return(dt_long)
} else {
align_ind <- list(
"U5MR" = "Under.five",
"Under.five.deaths" = "Under.five",
"IMR" = "Infant",
"Infant.deaths" = "Infant",
"NMR" = "Neonatal",
"Neonatal.deaths" = "Neonatal"
)
dt_long[, Shortind:= get.match(Shortind, new_list = align_ind)]
formula0 <- paste(paste(idvars, collapse = " + "), "+ Shortind + Sex + Year + Quantile ~ Get" )
dt_wide_get <- data.table::dcast.data.table(dt_long, formula = formula0, value.var = "value")
return(dt_wide_get)
}
}
}
}
#' Read "Rates & Deaths_Country Summary.csv" and output long format
#'
#' This function is a simpler version of \code{\link{get.CME.UI.data}} and is
#' more straightforward to use: without many arguments, only output long format.
#' The arguments are aligned so this function could be replaced by
#' \code{\link{get.CME.UI.data}} in the code if needed.
#' \code{\link{get.CME.UI.data}} offers more options, including output format,
#' selecting id.vars. etc
#'
#' @param dir_file directory to Rates & Deaths_Country Summary.csv
#' @param year_range default to null, supply e.g. 1990:2019
#' @param sex default to NULL, will determine from file dir
#' @export
read.country.summary <- function(
dir_file, # directory to "Rates & Deaths_Country Summary.csv"
year_range = NULL, # e.g. 1990:2019
sex = NULL
){
if(!file.exists(dir_file)) stop("File doesn't exist: ", dir_file)
dt_cs <- fread(dir_file)[ISO3Code!="LIE"]
# clean up column names
setnames(dt_cs, gsub(" ", ".", colnames(dt_cs)))
setnames(dt_cs, gsub("-", ".", colnames(dt_cs)))
# find the Quantile column:
if("Quintile"%in%colnames(dt_cs))setnames(dt_cs, "Quintile", "Quantile")
if("X"%in%colnames(dt_cs)){
if(dt_cs$X[1]!=1) setnames(dt_cs, "X", "Quantile") # it could be a indexing column
}
# in case leave as blank, column will have names like V99, V101, etc
if(!"Quantile"%in%colnames(dt_cs)){
columnV <- grep("V", colnames(dt_cs), value = TRUE)
columnV <- columnV[which(nchar(columnV) %in% c(3,4))]
message("Assign this column as Quantile column: ", columnV[1])
setnames(dt_cs, columnV[1], "Quantile")
}
# get all the variables available in the datasets:
# e.g. c("X5q15", "X10q15", "X5q20")
vars <- grep(".2019", colnames(dt_cs), value = TRUE, fixed = TRUE)
vars <- gsub(".2019", "", vars)
# available years
available_years <- grep(vars[1], colnames(dt_cs), value = TRUE, fixed = TRUE)
available_years <- as.numeric(gsub(paste0(vars[1], "."), "", available_years))
if(!is.null(year_range)){
# so it is OK to supply years by mistake like year_range = 2000.4, match by
# flooring
year_range <- available_years[floor(available_years)%in%floor(as.numeric(year_range))]
if(length(year_range)==0){
message("Supplied `year_range` is not in available years.\n",
"Available years are between ", paste(range(available_years), collapse = " and "),
" --- will use all available years")
year_range <- available_years
}
} else {
message("`year_range` set to NULL: use all available years in the dataset: ", paste(range(available_years), collapse = "-"))
year_range <- available_years
}
vars_wanted <- do.call(paste0, expand.grid(vars, ".", year_range))
dt_cs <- dt_cs[, c("ISO3Code", "Quantile", vars_wanted), with = FALSE]
dt_cs[, (vars_wanted):=lapply(.SD, as.numeric), .SDcols = vars_wanted]
dt_long <- melt.data.table(dt_cs, id.vars = c("ISO3Code", "Quantile"),
variable.factor = FALSE)
dt_long[, Year := as.numeric(substr(variable, nchar(variable)-3, nchar(variable)))]
dt_long[, Shortind := substr(variable, 1, nchar(variable)-5)]
dt_long[, Shortind := gsub("Deaths", "deaths", Shortind)]
dt_long[, variable:= NULL]
# determine sex from dir
if(is.null(sex)){
if(grepl("female", dir_file, ignore.case = TRUE)){
sex <- "Female"
} else if (grepl("male", dir_file, ignore.case = TRUE)) {
sex <- "Male"
} else {
sex <- "Total"
}
}
dt_long[, Sex:= sex]
return(dt_long)
}
#' Read regional summary and output long format
#' @param dir_file directory to e.g. paste0("Rates & Deaths_UNICEFRegion.csv")
#'
#' @param year_range default to null, supply e.g. 1990:2019
#' @param add_regional_grouping if add `Regional_Grouping` column
#' @param sex default to NULL, will determine from file dir
#'
#' @export
read.region.summary <- function(
dir_file, # regional summary in aggregate results
year_range = NULL,# e.g. c(1990, 2019)
sex = NULL,
add_regional_grouping = FALSE
){
if(!file.exists(dir_file)) stop("File doesn't exist: ", dir_file)
dt_cs <- fread(dir_file)
if(!"Year" %in% colnames(dt_cs)) stop ("There is no `Year` column, are you reading the country summary?")
# What region is it?
if(grepl("SDG", dir_file)) Regional_Grouping <- "SDG"
if(grepl("UNICEF", dir_file)) Regional_Grouping <- "UNICEF"
if(grepl("WB", dir_file)) Regional_Grouping <- "World Bank"
# clean up the col names
setnames(dt_cs, gsub(" ", ".", colnames(dt_cs)))
setnames(dt_cs, gsub("-", ".", colnames(dt_cs)))
#
vars0 <- colnames(dt_cs)
vars0 <- vars0[!grepl("Population", vars0, ignore.case = TRUE)]
vars_wanted <- vars0[!vars0%in%c("Region", "Year")]
# available years
available_years <- sort(unique(dt_cs$Year))
if(!is.null(year_range)){
# so it is OK to supply years by mistake like year_range = 2000.4, match by
# flooring
year_range <- available_years[floor(available_years)%in%floor(as.numeric(year_range))]
if(length(year_range)==0){
message("Supplied `year_range` is not in available years.\n",
"Available years are between ", paste(range(available_years), collapse = " and "),
" --- will use all available years")
year_range <- available_years
}
} else {
message("`year_range` set to NULL: use all available years in the dataset: ", paste(range(available_years), collapse = "-"))
year_range <- available_years
}
dt_cs <- dt_cs[Year%in%year_range]
dt_cs[, (vars_wanted):=lapply(.SD, as.numeric), .SDcols = vars_wanted]
if("Region" %in% colnames(dt_cs)){
id_vars <- c("Region", "Year")
} else {
id_vars <- c("Year")
message("There is no `Region` column, assume this is world results")
}
dt_long <- melt.data.table(dt_cs[,..vars0], id.vars = id_vars, variable.factor = FALSE)
dt_long[grepl("upper", variable, ignore.case = TRUE), Quantile := "Upper"]
dt_long[grepl("median", variable, ignore.case = TRUE), Quantile := "Median"]
dt_long[grepl("lower", variable, ignore.case = TRUE), Quantile := "Lower"]
dt_long[, Shortind := gsub("X|.lower.bound|.upper.bound|.median", "", variable)]
dt_long[, Shortind := gsub("Deaths", "deaths", Shortind)]
dt_long[, variable:= NULL]
if(add_regional_grouping) dt_long[, Regional_Grouping:= Regional_Grouping]
dt_long[, Year:= floor(Year) + 0.5]
# determine sex from dir
if(is.null(sex)){
if(grepl("female", dir_file, ignore.case = TRUE)){
sex <- "Female"
} else if (grepl("male", dir_file, ignore.case = TRUE)) {
sex <- "Male"
} else {
sex <- "Total"
}
}
dt_long[, Sex:= sex]
return(dt_long)
}
#' Read one results.csv file and reformat into long-format
#'
#' @param dt_dir the single directory to a results.csv file
#' @param year_range year range desired, default to all years: `1931:2030`
#' @param q quantile desired, default to `c("Lower", "Median", "Upper")`
#' @param sex default to NULL, sex is determined from `dt_dir`, unless specified
#' @return long-format results.csv, all values in the "value" column
#' @export read.results.csv
read.results.csv <- function(
dt_dir,
year_range = 1931:2030,
q = c("Lower", "Median", "Upper"),
sex = NULL
){
id_vars <- c("ISO.Code", "Quantile", "Indicator")
if(grepl(".xlsx|.xls", dt_dir)){
# dt1 <- setDT(readxl::read_xlsx(dt_dir))
stop("dt_dir points to xlsx file.")
} else {
dt1 <- fread(dt_dir, header = TRUE)
}
dt1 <- dt1[Quantile %in% q, ]
# align the column names first
if("ISO Code"%in%colnames(dt1)) setnames(dt1, "ISO Code", "ISO.Code")
if(!is.numeric(year_range)){
year_range <- 1931:2030
message("`year_range` set to default: 1931-2030")
}
if(any(grepl("X", colnames(dt1)))){
value_vars <- paste0("X", year_range, ".5")
} else if (any(grepl(".5", colnames(dt1), fixed = TRUE))) { # no need for \\.5 if fixed = TRUE
value_vars <- paste0(year_range, ".5")
} else {
value_vars <- paste0(year_range)
}
value_vars <- value_vars[value_vars%in%colnames(dt1)] # only available years will be picked
dt1[, (value_vars):= lapply(.SD, as.numeric), .SDcols = value_vars]
# can also assign indicator for P-factor: P5/4/1_country.csv for under-five
# cannot assign automatically for older children
ind0 <- dt1$Indicator[1]
if(grepl("U5MR|P5", ind0)) dt1$Indicator <- "Under-five Mortality Rate"
if(grepl("IMR|P1", ind0)) dt1$Indicator <- "Infant Mortality Rate"
if(grepl("CMR|P4", ind0)) dt1$Indicator <- "Child mortality rate age 1-4"
if(grepl("10q5|5-14", dt_dir)) dt1$Indicator <- "Mortality rate age 5-14"
if(grepl("5q5|5-9", dt_dir)) dt1$Indicator <- "Mortality rate age 5-9"
if(grepl("5q10|10-14", dt_dir)) dt1$Indicator <- "Mortality rate age 10-14"
if(grepl("10q15|15-24", dt_dir)) dt1$Indicator <- "Mortality rate age 15-24"
if(grepl("5q15|15-19", dt_dir)) dt1$Indicator <- "Mortality rate age 15-19"
if(grepl("5q20|20-24", dt_dir)) dt1$Indicator <- "Mortality rate age 20-24"
# but overwrite for P-factor results if it is older children. Cannot
# automatically judge indicators since cannot distinguish 5-14 vs 15-24. Need
# to do it manually later
if(!(dt1$Indicator[1] %in% c("Under-five Mortality Rate",
"Infant Mortality Rate",
"Child mortality rate age 1-4")) & grepl("P", dt_dir)) dt1$Indicator <- ind0
vars_wanted <- c(id_vars, value_vars)
dt1_long <- melt.data.table(dt1[,..vars_wanted], measure.vars = value_vars,
variable.name = "Year", variable.factor = FALSE)
dt1_long[, value:=as.numeric(value)]
if(any(grepl("X", colnames(dt1)))) {
dt1_long[, Year:= floor(as.numeric(sub("X","",Year)))]
} else {
dt1_long[, Year:= floor(as.numeric(Year))]
}
# determine sex from dir
if(is.null(sex)){
if(grepl("female", dt_dir, ignore.case = TRUE)){
sex <- "Female"
} else if (grepl("male", dt_dir, ignore.case = TRUE)) {
sex <- "Male"
} else if (grepl("P", dt_dir)) {
sex <- "P_factor"
} else if (grepl("Sex ratio", dt_dir)) {
sex <- "Sex_Ratio"
} else {
sex <- "Total"
}
if(grepl("expected", dt_dir, ignore.case = TRUE)) sex <- paste(sex, "expected")
}
dt1_long[, Sex:= sex]
ind_list <- list(
"Under-five Mortality Rate" = "U5MR",
"Infant Mortality Rate" = "IMR",
"Child mortality rate age 1-4" = "CMR",
"Neonatal Mortality Rate" = "NMR",
"Mortality rate age 5-14" = "10q5",
"Mortality rate age 5-9" = "5q5",
"Mortality rate age 10-14" = "5q10",
"Mortality rate age 15-24" = "10q15",
"Mortality rate age 15-19" = "5q15",
"Mortality rate age 20-24" = "5q20"
)
dt1_long[, Shortind:= get.match(Indicator, new_list = ind_list)]
setkey(dt1_long, ISO.Code, Year)
dt1_long <- dt1_long[,.(ISO.Code, Shortind, Indicator, Sex, Year, Quantile, value)]
return(dt1_long)
}
#' Read and bind all "results.csv" into one file using `rbindlist`
#'
#' Use \code{\link{read.results.csv}} to read all "results.csv" files
#'
#' @param results_dir_list results_dir_list list of all the results.csv files to
#' read in and compare
#' @param year_range0 year range, default to years `1931:2021`
#' @param value_name name of the value, default to "Results"
#'
#' @return long-format results.csv for all indicators, all values in the
#' `value_name` column
#' @export read.all.results.csv
read.all.results.csv <- function(
results_dir_list,
year_range0 = 1931:2030,
value_name = "Results"
){
# a list of all the results files:
# combine original results
dt_results_2020 <- rbindlist(lapply(results_dir_list, read.results.csv, year_range = year_range0))
setnames(dt_results_2020, "value", value_name)
# dt_results_2020[, Quantile:= as.factor(Quantile)]
# if(!identical(levels(dt_results_2020$Quantile), c("Lower", "Median", "Upper"))) stop("Check Quantile levels: ", paste(levels(dt_results_2020$Quantile), collapse = ", "))
# dt_results_2020$Quantile <- factor(dt_results_2020$Quantile, levels = c("Median", "Lower", "Upper"))
return(dt_results_2020)
}
#' Load the "country.info.CME"
#'
#' Creates UNICEFReportRegion from UNICEFReportRegion1 and UNICEFReportRegion2
#'
#' @param year0 IGME round, e.g. 2021
#'
#' @import data.table
#' @export get.country.info.CME
#' @return dataset of country info
get.country.info.CME <- function(year0){
# dir_input <- file.path(get.workdir(year = year0), "input")
dc <- fread(file.path(dir_input, "country.info.CME.csv"))
# UNICEFReportRegion2 offers subregions for ECA and SSA, combined into UNICEFReportRegion
dc[, UNICEFReportRegion:= ifelse(UNICEFReportRegion2 == "", UNICEFReportRegion1, UNICEFReportRegion2)]
return(dc)
}
#' Load the "data_livebirths.csv", add ISO3Code and country names
#'
#' @param year0 IGME round, e.g. 2021
#' @export get.live.birth
#' @return dataset of live birth
get.live.birth <- function(year0){
dir_input <- file.path(get.workdir(year = year0), "input")
dc <- fread(file.path(dir_input, "country.info.CME.csv"))
dtlb <- fread(file.path(dir_input, "data_livebirths.csv"))
dtlb <- merge(dc[,.(ISO3Code, CountryName, OfficialName, UNCode)], dtlb, by.x = "UNCode", by.y = "uncode")
return(dtlb)
}
# General helper -----------------------------------------------------------
#' Capitalize first letter of each word in the vector
#' @param y vector of strings
#' @return a vector of strings with first letter capitalized
#' @export upper.first.letter
#' @examples upper.first.letter(c("aa","bb","cc"))
upper.first.letter <- function(y){
if(!is.character(y)) y <- as.character(y)
upper.first.letter0 <- function(y) {
c <- strsplit(y, " ")[[1]]
paste(toupper(substring(c, 1, 1)), substring(c, 2),
sep="", collapse=" ")
}
return(unname(sapply(y, upper.first.letter0)))
}
#' A label function to replace values by a given list in a variable
#'
#' You can provide a __new_list__ to define the values you wish to change in
#' this variable. Values not revised in the given list will be kept
#'
#' @param x a element or a vector
#' @param new_list if you supply a new list the function will use instead of the
#' default_labels
#' @param show_no_match default to FALSE, if TRUE will message unmatched
#' elements
#' @param no_line_break default to FALSE, if TRUE will remove line break from
#' the string
#'
#' @export get.match
#' @return an updated vector as character
get.match <- function(x,
new_list = NULL,
no_line_break = FALSE,
show_no_match = FALSE){
if(is.null(new_list)){
labs <- default_label
} else {
if(is.list(new_list)){
labs <- new_list
} else {
message("new_list must be a list. Still use the default list.")
labs <- default_label
}
}
if(!is.character(x)){
message("Coerse input into character.")
x <- as.character(x)
}
out <- rep(NA, length(x))
for (i in 1:length(x)){
if (is.null(labs[[ x[i] ]])){
out[i] <- x[i]
if(show_no_match) message("Notice from `get.match`: unmatched input: ", x[i])
}else{
out[i] <- labs[[ x[i] ]]
}
}
return(if(no_line_break)gsub("\n", "", out) else out)
}
#' Check and attach libraries. Install if not.
#' @importFrom utils install.packages
#' @param pkgs vector of packages
#' @return NULL
#' @export check.and.install.pkgs
#'
check.and.install.pkgs <- function(pkgs){
search_package <- sapply(pkgs, find.package, quiet = TRUE) # return a string or character(0)
new.packages <- pkgs[sapply(search_package, function(x)length(x)==0)]
if(length(new.packages)) install.packages(new.packages, dependencies = TRUE)
suppressPackageStartupMessages(invisible(lapply(pkgs, library, character.only = TRUE)))
}
# Find directories --------------------------------------------------------
# functions to search and get all major datasets
#' Return the IGME "Code" dir for a given year
#'
#' If `year` is 2020, returns the directory to Code folder in the 2020 Round
#' Estimation Dropbox folder
#' @param year YYYY
#' @return directory to input folder
#' @export get.workdir
get.workdir <- function(year = 2023){
USERPROFILE <- load_os_leading_dir()
work_dir <- file.path(USERPROFILE, paste0("/Dropbox/UN IGME Data/", year ," Round Estimation/Code"))
if(!dir.exists(work_dir))warning("Note that this directory doesn't exist: ", work_dir)
return(work_dir)
}
#' GReturn the IGME "Code" dir on SharePoint
#'
#' If `year` is 2020, returns the directory to Code folder in the 2020 Round
#' Estimation Dropbox folder
#' @param year YYYY
#' @return directory to "Code" folder
#' @export get.workdir.sharepoint
get.workdir.sharepoint <- function(year = 2024){
user_name <- Sys.getenv("USERNAME")
#
if(user_name == "lyhel"){
home_dir <- "D:/OneDrive - UNICEF/Documents - Child Mortality/UN IGME data"
} else if(user_name == "someone"){
# add you home directory to "Documents - Child Mortality/UN IGME data":
home_dir <- ""
} else {
stop("Please add your SharePoint home directory in function `get.workdir.sharepoint`")
}
work_dir <- file.path(home_dir, paste0(year, " Round Estimation/Code"))
if(!dir.exists(work_dir))warning("Note that this directory doesn't exist: ", work_dir)
return(work_dir)
}
#' Internal function: Check if `date` is a leap year
#'
#' @param date date
leap_year <- function(date){
if (is.numeric(date)) {
year <- date
}
else {
year <- year(date)
}
(year%%4 == 0) & ((year%%100 != 0) | (year%%400 == 0))
}
#' Calculate start, end and average date in decimal from starting/end dates
#'
#' @importFrom data.table year
#' @param date0 date for example: 2020-01-01
#' @param date1 date for example: 2020-12-31
#' @return a list of date start, date end, date average. for example: 2020,
#' 2020.997, 2020.497
#' @export get.ref.date
#' @examples get.ref.date("2020-01-01", "2020-12-31")
get.ref.date <- function(date0,
date1){
date0 <- as.Date(date0)
date1 <- as.Date(date1)
date_start <- get.numeric.date(date0)
date_end <- get.numeric.date(date1)
date_ave <- get.numeric.date(date0 + difftime(date1, date0)/2)
date_ave_d <- date0 + difftime(date1, date0)/2
return(list(date_start=date_start, date_end=date_end, date_ave=date_ave, date_ave_d = date_ave_d))
}
#' Transform date into numeric numbers like 2020.55
#'
#' @param date0 date for example: 2020-01-01
#' @return numeric date for example: 2020.014
#' @export
get.numeric.date <- function(date0){
get.numeric.date.core <- function(date0){
if(is.na(date0)) return(NA)
y1 <- data.table::year(date0)
n_days1 <- ifelse(leap_year(y1), 366, 365) # e.g. 2020 is a leap year with 366 days
first_day_of_year <- as.Date(paste(y1, 1, 1, sep = "-")) # use to count diff days
date_num <- as.double(difftime(date0, first_day_of_year))/n_days1 + y1
return(date_num)
}
# support vector input
unname(sapply(date0, get.numeric.date.core))
}
# Get database path -------------------------------------------------------
#' Show all file directories within the file directory `dir_file` and matched by
#' pattern `pattern0`
#'
#' Search only the files in the folder, match by `pattern0`, the search is not
#' recursive.
#' @param dir_file directory
#' @param pattern0 string to match file names
#' @param full_name list.files(full.names), if TRUE (default) returns full
#' directories, if FALSE, return only the file names
#' @return vector of matched file directories
#' @export get.file.name
get.file.name <- function(dir_file,
pattern0,
full_name = TRUE){
if(is.null(dir_file))message("dir_file is NULL. Please double check.")
# if(!dir.exists(dir_file))message("Check if dir_file exists: ", dir_file)
files <- list.files(dir_file)
files_full <- list.files(dir_file, full.names = TRUE)
return(if(full_name)files_full[which(grepl(pattern0, files))] else files[which(grepl(pattern0, files))])
}
#' Internal function to check if the input is date, and figure out which date is
#' the latest
#'
#' @param mydate a vector of dates
#' @return an integer returned by `which.max`
get.max.date <- function(mydate) {
align.date <- function(mydate){
if(!is.na(as.Date(mydate, "%Y-%m-%d"))){
mydate <- as.Date(mydate, "%Y-%m-%d")
} else if (!is.na(as.Date(mydate, "%Y%m%d"))){
mydate <- as.Date(mydate, "%Y%m%d")
} else {
mydate <- NA
}
return(mydate)
}
out <- sapply(mydate, align.date)
return(which.max(out))
}
#' Find out the latest date of all the master files in the directory using the
#' dates in file names
#' @param files file path
#'
find_latest_date <- function(files){
remove_string <- c("data_U5MR_|.csv|data_IMR_|data_NMR_|_5year|dataset_formodeling_|dataset_forplotting_|SexSpecific-entries_|data_residence_")
dates <- gsub(remove_string, "", files)
# screen for valid date string:
# dates <- c("2015", "20200804", "2020-08-01")
# return which.max e.g. 2L
get.max.date(dates)
}
#' Get the file directory with latest date in the filename
#'
#' @param dir_folder The directory to search for files
#' @param pattern_to_match Pattern used to match filename
#' @return file path to the dataset
#' @export get.dir_latest_file
get.dir_latest_file <- function(dir_folder, pattern_to_match){
files_full <- get.file.name(dir_file = dir_folder, pattern0 = pattern_to_match)
files <- get.file.name(dir_file = dir_folder, pattern0 = pattern_to_match, full_name = FALSE)
file_selected <- files_full[find_latest_date(files)]
if(length(file_selected)!=0){
message(paste(pattern_to_match, "dataset chosen: \n", file_selected))
return(file_selected)
} else {
message("No corresponding dataset found in: \n ", dir_folder)
return(NULL)
}
}
#' Get the U5MR master dataset directory
#'
#' @param workdir The directory to IGME Code folder, e.g. ".../202x Round
#' Estimation/Code"
#' @param pattern_to_match default to "data_U5MR", but can be used generally
#'
#' @return file path to the master dataset
#' @export get.dir_U5MR
get.dir_U5MR <- function(workdir, pattern_to_match = "data_U5MR"){
workdir_input <- file.path(workdir, "input")
files_full <- get.file.name(dir_file = workdir_input, pattern0 = pattern_to_match)
files <- get.file.name(dir_file = workdir_input, pattern0 = pattern_to_match, full_name = FALSE)
file_selected <- files_full[find_latest_date(files)]
if(length(file_selected)!=0){
message(paste("U5MR master dataset chosen: \n", file_selected))
return(file_selected)
} else {
message("No corresponding dataset found in: \n ", workdir_input)
return(NULL)
}
}
#' Get the IMR master dataset directory
#'
#' @param workdir The directory to IGME input folder, e.g. ".../2020 Round
#' Estimation/Code/input/"
#' @return file path to the master dataset
#' @export get.dir_IMR
get.dir_IMR <- function(workdir){
workdir_input <- file.path(workdir, "input")
files_full <- get.file.name(dir_file = workdir_input, pattern0 = "data_IMR")
files <- get.file.name(dir_file = workdir_input, pattern0 = "data_IMR", full_name = FALSE)
file_selected <- files_full[find_latest_date(files)]
if(length(file_selected)!=0){
message(paste("IMR master dataset chosen: \n", file_selected))
return(file_selected)
} else {
message("No corresponding dataset found in: \n ", workdir_input)
return(NULL)
}
}
#' Get the NMR master dataset directory
#'
#' Compare to \code{\link{get.dir_U5MR}}, there is need to supply workdir since
#' the dataset location is fixed at "/NMR/data"
#'
#' @param y5 to get the 5-year dataset or not
#' @param dir_IGME_NMR default to "Dropbox/NMR/data"
#'
#' @return file path to the master dataset
#' @export get.dir_NMR
get.dir_NMR <- function(
y5 = FALSE,
dir_IGME_NMR = NULL
){
if(is.null(dir_IGME_NMR)){
dir_IGME_NMR <- file.path(load_os_leading_dir(), "Dropbox/NMR/data")
}
if(y5){
files_full <- get.file.name(dir_file = dir_IGME_NMR, pattern0 = "data_NMR_")
files_full <- files_full[grepl("5year", files_full)]
files <- get.file.name(dir_file = dir_IGME_NMR, pattern0 = "data_NMR_", full_name = FALSE)
files <- files[grepl("5year", files)]
} else {
files_full <- get.file.name(dir_file = dir_IGME_NMR, pattern0 = "data_NMR_")
files_full <- files_full[!grepl("5year", files_full)]
files <- get.file.name(dir_file = dir_IGME_NMR, pattern0 = "data_NMR_", full_name = FALSE)
files <- files[!grepl("5year", files)]
}
file_selected <- files_full[find_latest_date(files)]
if(length(file_selected)!=0){
message(paste("NMR master dataset chosen: \n", file_selected))
return(file_selected)
} else {
message("No corresponding dataset found in: \n ", workdir)
return(NULL)
}
}
#' Get the sex-specific master dataset directory
#'
#' Compare to \code{\link{get.dir_U5MR}}, there is need to supply workdir since
#' the dataset location is fixed at "/CMEgender2015/Database"
#'
#' @param plotting to get the dataset for plotting (if TRUE) or dataset for
#' modeling (if FALSE)
#' @param dir_IGME_gender default to "/Dropbox/CMEgender2015/Database"
#' @return file path to the master dataset
#' @export get.dir_gender
get.dir_gender <- function(
plotting = TRUE,
dir_IGME_gender = NULL
){
if(is.null(dir_IGME_gender)){
if(plotting){
dir_IGME_gender <- file.path(load_os_leading_dir(),"/Dropbox/CMEgender2015/Database")
} else {
dir_IGME_gender <- file.path(load_os_leading_dir(),"/Dropbox/CMEgender2015/data/interim")
}
}
if(plotting){
files_full <- get.file.name(dir_file = dir_IGME_gender, pattern0 = "dataset_forplotting")
files <- get.file.name(dir_file = dir_IGME_gender, pattern0 = "dataset_forplotting", full_name = FALSE)
}else{
files_full <- get.file.name(dir_file = dir_IGME_gender, pattern0 = "dataset_formodeling")
files <- get.file.name(dir_file = dir_IGME_gender, pattern0 = "dataset_formodeling", full_name = FALSE)
}
file_selected <- files_full[find_latest_date(files)]
if(length(file_selected)!=0){
message(paste("Sex-specific master dataset chosen: \n", file_selected))
return(file_selected)
} else {
message("No corresponding dataset found in: \n ", workdir)
return(NULL)
}
}
# For CMRJack results directories
#' Get optimal file directory from `Output CMRJack` folder
#' @param cname country name
#' @param surveytype folder names like "DHS", "MICS", "NDHS",...
#' @param year year of the survey, e.g. 2015
#' @return xlsx file directory
#' @export get.opt.dir
#' @examples
#' \dontrun{
#' get.opt.dir("Zimbabwe", "DHS", 2015)
#' }
get.opt.dir <- function(
cname,
surveytype = "DHS",
year = NULL){
cname <- gsub(" ", "", cname)
dir_opt <- file.path(load_os_leading_dir(), "Dropbox/IGME Data/Output CMRJack/All/BH", surveytype, "Real/Optimal")
files <- get.file.name(dir_file =dir_opt, pattern0 = cname)
if(any(grepl(" CY ", files))) files <- grep(" CY ", files, value = TRUE)
if(!is.null(year))files <- grep(year, files, value = TRUE)
return(files)
}
#' Get raw file directory from `Output CMRJack` folder
#' @param cname country name
#' @param surveytype folder names like "DHS", "MICS", "NDHS",...
#' @param year year of the survey, e.g. 2015
#' @return xlsx file directory
#' @export get.raw.dir
#' @examples
#' \dontrun{
#' get.raw.dir("Zimbabwe", "DHS", 2015)
#' }
get.raw.dir <- function(cname, surveytype = "DHS", year = NULL){
cname <- gsub(" ", "", cname)
dir_opt <- file.path(load_os_leading_dir(), "Dropbox/IGME Data/Output CMRJack/All/BH", surveytype, "Real/Raw")
files <- get.file.name(dir_file =dir_opt, pattern0 = cname)
if(any(grepl(" CY ", files))) files <- grep(" CY ", files, value = TRUE)
if(!is.null(year)) files <- grep(year, files, value = TRUE)
return(files)
}
# extra
#' Adjust the file dir if the lash is not right or the Dropbox username is not
#' right
#'
#' @param dir0 file directory not output for now
#' @export revise.path
revise.path <- function(dir0){
# if there is backslack, replace it
if(grep("\\\\", dir0)) dir <- gsub("\\\\", "\\/", dir0)
# replace username if it is not right
if(!grepl(Sys.getenv("USERNAME"), dir)) dir <- file.path(load_os_leading_dir(),"Dropbox", sub("^.*Dropbox", "", dir))
if(!file.exists(dir)) stop("check if dir exists: ", dir)
return(dir)
}
#' check system, return . Used as nternal function.
#'
#' @return "Windows", "OSX", or "Linus"
get_os <- function(){
sysinf <- Sys.info()
if (!is.null(sysinf)){
os <- sysinf['sysname']
if (os == 'Darwin')
os <- "osx"
} else { ## mystery machine
os <- .Platform$OS.type
if (grepl("^darwin", R.version$os))
os <- "osx"
if (grepl("linux-gnu", R.version$os))
os <- "linux"
}
return(tolower(os))
}
#' Search for file paths matched by part of the file name
#'
#' Search for files containing the `file_name_string` in all sub-folders in
#' `target.dir`, and list files containing the `file_name_string`
#'
#' @param target.dir target directory
#' @param file_name_string e.g. "data_U5MR"
#' @param full_path full path or not
search.for.file <- function(target.dir, file_name_string, full_path = FALSE){
n <- which(grepl(file_name_string, list.files(target.dir, recursive = TRUE)))
list.files(target.dir, recursive = TRUE, full.names = full_path)[n]
}
#' Get leading path in file directories depending on operation system (Mac OSX or Windows)
#'
#' @return "Users/<username>" or "C:/Users/<username>"
#' @export load_os_leading_dir
#'
#' @examples load_os_leading_dir()
#'
load_os_leading_dir <- function(){
user <- Sys.info()[["user"]]
os <- get_os()
if(!os %in% c("windows", "osx")) warning ("For now only defined for Windows and Mac OSX")
leading_path <- if(os == "osx") file.path("/Users", user) else Sys.getenv("USERPROFILE")
return(leading_path)
}
# Calculation -------------------------------------------------------------
# functions to do calculations
#' Calculate 5q0 from 1q0 and 4q1
#'
#' @param q1 1q0
#' @param q4 4q1
#' @param use_q_not_rate default to TRUE, if TRUE q1, q4 are probability of
#' dying dx/lx, if FALSE are mortality rate per 1,000
#'
#' @return 5q0
#' @export
calculate.5q0 <- function(q1, q4, use_q_not_rate = TRUE){
if(use_q_not_rate){
if(q1[1] > 1 | q4[1] > 1) message("Double check if q1 and q4 are probabilities (< 1) or mortality rate")
q <- 1 - (1 - q1) * (1 - q4)
} else {
q <- (1 - (1 - q1 / 1E3) * (1 - q4 / 1E3)) * 1E3
}
return(q)
}
#' Calculate 4q1 from 5q0 and 1q0
#'
#' Calculate 4q1 from 5q0 and 1q0 or any part from the whole
#' e.g. NMR from IMR and PNMR
#'
#' @param q5 5q0
#' @param q1 1q0
#' @param use_q_not_rate default to TRUE, if TRUE q5, q1 are probability of
#' dying dx/lx, if FALSE are mortality rate per 1,000
#'
#' @return 4q1
#' @export
#'
calculate.4q1 <- function(q5, q1, use_q_not_rate = TRUE){
if(use_q_not_rate){
if(q1[1] > 1 | q5[1] > 1) message("Double check if q1 and q4 are probabilities (< 1) or mortality rate")
q <- 1 - (1 - q5) / (1 - q1)
} else {
q <- (1 - (1 - q5 / 1E3) / (1 - q1 / 1E3)) * 1E3
}
return(q)
}
#' Get the logarithmic annual rate of reduction, expressed as a percentage
#' reduction
#'
#' The formula for ARR is as follows: ARR = ln(rate2/rate1)/(t2-t1)*-100, where
#' t1 and t2 are years and t1<t2. Return unrounded results
#'
#'
#' @param dt wide data by year, year1 and year2 shall be columns like 1990, 2000
#' @param year1 year1 where year1 < year2
#' @param year2 year2 where year1 < year2
#' @return dt with added columns named like "1990-2000"
#' @export calculate.arr
#' @examples
#' \dontrun{
#' dt_final_ARR <- dt_final[Year%in%c(1990,2019)]
#' dt_final_ARR_w <- dcast(dt_final_ARR, ISO3Code ~ Year)
#' dt_final_ARR_w <- calculate.arr(dt_final_ARR_w, 1990, 2019)
#' }
#'
calculate.arr <- function(dt, year1, year2){
get.year <- function(years){
if(grepl("\\.5", years)) years <- gsub("\\.5", "", years)
as.numeric(gsub("[^\\d]+", "", years, perl = TRUE))
}
# use year1 and year2 value to get ARR
dt[, arr:= log(get(as.character(year2))/get(as.character(year1)))/(get.year(year2)-get.year(year1))*-100]
setnames(dt, "arr", paste0(year1, "-", year2))
return(dt)
}
#' Calculate percentage decline
#'
#' Calculate percentage decline and return unrounded results
#' @inheritParams calculate.arr
#' @return dt with added columns named like "PD1990-2000"
#' @export calculate.pd
#'
calculate.pd <- function(dt, year1, year2){
get.year <- function(years){
if(grepl("\\.5", years)) years <- gsub("\\.5", "", years)
as.numeric(gsub("[^\\d]+", "", years, perl = TRUE))
} # use year1 and year2 value to get ARR
dt[, pd:= (get(as.character(year2))/get(as.character(year1)) - 1)*-100]
setnames(dt, "pd", paste0("PD", year1, "-", year2))
dt
}
#' A rounding function that rounds off numbers in the conventional way: rounds 0.5 to 1
#'
#' Instead of in R by default round(0.5) = 0, roundoff(0.5, 0) = 1
#'
#' @param x the number
#' @param digits digits, default to 2
#' @return rounded numeric vector
#' @export roundoff
roundoff <- function(#
x, digits = 2
) {
if(!is.numeric(x)) message("x coerse to numeric. ")
x <- as.numeric(x)
z <- trunc(abs(x)*10^digits + 0.5)
z <- sign(x)*z/10^digits
return(z)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.