R/F_readdata.R

Defines functions verifyDenominators extractDenominators InternalFixEndYear PlotDataAvailability InternalPlotPropsDataAvailability GetObsCounts InternalInternalWhichSubreg InternalWhichReg InternalMakeCountryNamesShort InternalFixNumericClusterCodes InternalGetCountryCodes InternalGetRegionInfoForCountry InternalGetRegionInfoGeneral InternalRegExpsInputCells InternalGetDFColNames InternalRegExpsInputCols InternalFixRange ReadDataAll ReadFileAggregates PreprocessClassification PreprocessData CheckDataRange CheckDataMissing PrecheckData

###----------------------------------------------------------------------------------
### Leontine Alkema
### F_readdata.R
###
### MODIFIED by Mark Wheldon starting 2016-02-22 for extension to all women.
###----------------------------------------------------------------------------------
### Contains:
### general math functions (MOVED TO F_general_functions.R)
### function to read in data, and make summary tables (plots)
### functions to read/summarize regional info
### function to change country names

###----------------------------------------------------------------------------------
##' Check input data for errors.
##'
##' A wrapper for \code{\link{PreprocessData}} for use as a stand-alone function. Default for \code{return.processed.data.frame = FALSE}.
##'
##' @param write.model.fun
##' @param messages
##' @param warnings
##' @param return.processed.data.frame
##' @return
##' @author
##' @noRd
PrecheckData <- function(# Pre-check contraceptive prevalence data
  ### Pre-process contraceptive prevalence data
  data.csv = NULL, ##<< If \code{NULL}, data set included in package is used.
  ## To use alternative data set, use \code{data.csv = .../dataCPmodel.csv}, where ... refers to the file path where file is located.
  iso.select = NULL, ##<< If \code{NULL}, data of all countries/subpopulations in data set are read in, else
  ## only data of countries/subpopulations with selected ISO code is read in. # change JR, 20131104
  write.model.fun = "WriteModel"
  ) {
    packageStartupMessage("\n\n>>---------------------------------------------------------------------------->>\nChecking '", data.csv, "'\n>>---------------------------------------------------------------------------->>\n", sep = "")

    ## ## Set warnings to be saved till end
    ## oo <- options()
    ## on.exit(options(oo))
    ## options(warn = 1)

    PreprocessData(data.csv = data.csv
                  ,iso.select = iso.select
                  ,write.model.fun = write.model.fun
                  ,print.messages = TRUE
                  ,print.warnings = TRUE
                  ,return.processed.data.frame = FALSE
                   )

    ## ## WARNINGS
    ## cat("\n"); warnings()

    packageStartupMessage("\n<<----------------------------------------------------------------------------<<\nEND OF Checking '", data.csv, "'\n                NO PROBLEMS DETECTED BUT CHECK WARNINGS (IF ANY ABOVE)                \n<<----------------------------------------------------------------------------<<\n", sep = "")
}


CheckDataMissing <- function(col_name, data_frame, data_frame_name) {
        if(col_name %in% colnames(data_frame)) {
            miss.vals <- is.na(data_frame[, col_name])
            if(any(miss.vals)) {
                stop("There are missing values in '", data_frame_name, "' column '", col_name, "' at row(s) (ignoring header): ", paste0(which(miss.vals), collapse = ", "))
            }
        }
        return(invisible())
}

CheckDataRange <- function(col_name, data_frame, data_frame_name, range = c(0, 100)) {
        if(col_name %in% colnames(data_frame)) {
            outside.range <- data_frame[, col_name] < range[1] | data_frame[, col_name] > range[2]
            if(any(outside.range, na.rm = TRUE)) {
                stop("Values in '", data_frame_name, "' column '", col_name, "' fall outside the range '[",
                     paste(range, collapse = ", "), "]' at row(s) (ignoring header): ",
                     paste0(which(outside.range), collapse = ", "))
            }
        }
        return(invisible())
    }


###----------------------------------------------------------------------------------
PreprocessData <- function(# Pre-process contraceptive prevalence data
  ### Pre-process contraceptive prevalence data
  data.csv = NULL, ##<< If \code{NULL}, data set included in package is used.
  ## To use alternative data set, use \code{data.csv = .../dataCPmodel.csv}, where ... refers to the file path where file is located.
  iso.select = NULL, ##<< If \code{NULL}, data of all countries/subpopulations in data set are read in, else
  ## only data of countries/subpopulations with selected ISO code is read in. # change JR, 20131104
  write.model.fun = "WriteModel",
  print.messages = TRUE,
  print.warnings = TRUE,
  return.processed.data.frame = TRUE,
  marital.group = NULL
  ) {

    ## Read in input data
    data.raw <-
        read.csv(file = data.csv, header = TRUE, as.is = TRUE
                ,stringsAsFactors = FALSE, strip.white = TRUE
                 )

    cat(names(data.raw), "\n")
    cat("no. rows in dataset: ", nrow(data.raw), "\n")

    ## -------* Column Names

    ## Create column name regexps and cell value regexps
    data.col.names <- InternalRegExpsInputCols(write.model.fun = write.model.fun)
    data.cell.vals <- InternalRegExpsInputCells()

    ## Check for duplicates in column names
    dup_cols <- sapply(data.col.names$regex, function(z) sum(grepl(z, names(data.raw))) > 1)
    dup <- any(dup_cols)
    if(dup) stop(paste0("Column names in '", data.csv, "' are not unique enough. Check\n\t"
                        ,paste(names(dup_cols)[dup_cols], collapse = "\n\t")))

    ## Make sure data frame columns get named correctly
    for(i in 1:length(data.col.names$regex)) {
        names(data.raw)[grepl(pattern = data.col.names$regex[i], x = names(data.raw))] <-
            data.col.names$df.names[i]
    }

    ## Check that all required columns are present
    not.there <- !(data.col.names$df.names[data.col.names$required] %in% names(data.raw))
    if(sum(not.there) > 0) {
        miss.cols <-
            paste(data.col.names$df.names[data.col.names$required][not.there], collapse = ", ")
        stop("The following required columns are not in the input data file: ", miss.cols)
    }

    ## 'data.raw' column names used in the remainder
    country_col_name <-
        InternalGetDFColNames("country",
                              df = data.raw, regexp.list = data.col.names)
    iso_code_col_name <-
        InternalGetDFColNames("iso.code",
                              df = data.raw, regexp.list = data.col.names)
    abs_probe_col_name <-
        InternalGetDFColNames("absence.of.probing.questions.bias.1.",
                              df = data.raw, regexp.list = data.col.names)
    start_year_col_name <-
        InternalGetDFColNames("start.year",
                              df = data.raw, regexp.list = data.col.names)
    end_year_col_name <-
        InternalGetDFColNames("end.year",
                              df = data.raw, regexp.list = data.col.names)
    in_union_col_name <-
        InternalGetDFColNames("in.union",
                              df = data.raw, regexp.list = data.col.names)
    cp_mod_col_name <-
        InternalGetDFColNames("contraceptive.use.modern",
                              df = data.raw, regexp.list = data.col.names)
    cp_trad_col_name <-
        InternalGetDFColNames("contraceptive.use.traditional",
                              df = data.raw, regexp.list = data.col.names)
    cp_any_col_name <-
        InternalGetDFColNames("contraceptive.use.any",
                              df = data.raw, regexp.list = data.col.names)
    unmet_col_name <-
        InternalGetDFColNames("unmet",
                              df = data.raw, regexp.list = data.col.names)
    age_range_col_name <-
        InternalGetDFColNames("age.range",
                              df = data.raw, regexp.list = data.col.names)
    se_logr_mod_no_use_col_name <-
        InternalGetDFColNames("se.logr.modern.nouse",
                              df = data.raw, regexp.list = data.col.names)
    se_logr_trad_no_use_col_name <-
        InternalGetDFColNames("se.logr.trad.nouse",
                              df = data.raw, regexp.list = data.col.names)
    se_logr_unmet_no_need_col_name <-
        InternalGetDFColNames("se.logr.unmet.noneed",
                              df = data.raw, regexp.list = data.col.names)
    note_on_methods_col_name <-
        InternalGetDFColNames("note.on.methods",
                              df = data.raw, regexp.list = data.col.names)
    exclude_1_is_yes_col_name <-
        InternalGetDFColNames("exclude.1.is.yes",
                              df = data.raw, regexp.list = data.col.names)
    data_series_type_col_name <-
        InternalGetDFColNames("data.series.type",
                              df = data.raw, regexp.list = data.col.names)
    catalog_id_col_name <-
        InternalGetDFColNames("catalog.id",
                              df = data.raw, regexp.list = data.col.names)

    ## -------* Value Checks

    ## Absence of probing questions must be 0 or 1
    if(!isTRUE(all(data.raw[,abs_probe_col_name] %in% c(0,1))))
        stop("Column '", abs_probe_col_name, "' in the input file must contain only 0s and 1s.")

    ## Start.year and End.year must not be missing
    sapply(c(start_year_col_name, end_year_col_name),
           FUN = CheckDataMissing,
           data_frame = data.raw,
           data_frame_name = data.csv)

    ## Cell values
    sapply(c(cp_mod_col_name, cp_trad_col_name, unmet_col_name),
           CheckDataRange,
           data_frame = data.raw,
           data_frame_name = data.csv,
           range = c(0, 100))

    ## -------* Filter the correct marital group

    if(is.null(marital.group)) {
        marital.group <- "MWRA"
        warning("'marital.group' is 'NULL', setting 'marital.group' to 'MWRA'.")
    }
    idx_iu <- !(data.raw[,in_union_col_name] %in% c(0, 1))
        if(sum(idx_iu) > 0) {
            stop("'In.union' must only have values '0', or '1'. The following rows are not compliant:\n",
                 paste(which(idx_iu), collapse = ", "))
        }

    if (marital.group == "MWRA") {
        rows_in_orig_input <- which(data.raw[, in_union_col_name] == 1)
        data.raw <- data.raw[rows_in_orig_input,]
    } else if (marital.group == "UWRA") {
        rows_in_orig_input <- which(data.raw[, in_union_col_name] == 0)
        data.raw <- data.raw[rows_in_orig_input,]
    } else {
        stop("marital group must be NULL, 'MWRA' or 'UWRA'")
    }

    cat("no. rows in dataset matching marital group ", marital.group, ": ", nrow(data.raw), "\n", sep = "")

    ## -------* Delete rows with no ISO and country name

    idx.keep <- !(is.na(data.raw[, country_col_name]) | is.na(data.raw[, iso_code_col_name]))
    if(sum(!idx.keep, na.rm = TRUE) > 0) {
        if(print.messages) message("Deleting rows with no ISO and country name:\nThe following rows in '", data.csv, "' removed because 'ISO.code' or 'Country' missing: ", paste0(rows_in_orig_input[!idx.keep], collapse = ", "), "\n")
        }
    data.raw <- data.raw[idx.keep,]
    rows_in_orig_input <- rows_in_orig_input[idx.keep]

    if (!is.null(iso.select)) {
        if (all(grepl("[[:digit:]]", iso.select))) {
            if(sum(as.numeric(data.raw[, iso_code_col_name]) %in% iso.select) > 0) {
                data.raw <- data.raw[as.numeric(data.raw[, iso_code_col_name]) %in% iso.select, ]
            } else {
                stop("One country model cannot be run for ISO ", iso.select, " because there are not observations for it in '", data.csv, "'.")
            }
        } else if (all(grepl("[[:alpha:]]", iso.select))) {
            stop("Three-letter country codes supplied as argument to 'iso.select' but these are not required in the input data file (only in the classifications file). Supply numeric iso codes instead")
        }

    if(print.messages) message(paste0("Note: Only data for ISO ", paste(iso.select, collapse = ", ")
             ," (", data.raw[as.numeric(data.raw[, iso_code_col_name]) %in% iso.select, "Country"][1], ") is read in.\n"))
  }

    ## -------* Process CP Values

    data.raw$rounded.up <- rep(FALSE, nrow(data.raw))

    ## -------** Round to 9 DPs

    data.raw[, cp_mod_col_name] <-
        round(as.numeric(data.raw[, cp_mod_col_name]), 9)
    data.raw[, cp_trad_col_name] <-
        round(as.numeric(data.raw[, cp_trad_col_name]), 9)
    data.raw[, cp_any_col_name] <-
        round(as.numeric(data.raw[, cp_any_col_name]), 9)

    ## -------** Only Mod or Only Trad

    ## There are two ways to handle observations which only measure CP Modern (no CP Trad, no CP Any).
    ## 1) Model the logit of CP Modern with a new component in the data model.
    ## 2) Assume the CP Mod observation is for CP Any: set CP Mod to
    ## the observed minus 0.001 (0.1%), set CP Trad to 0.001.

    ## Mark NAs
    trad.na <- is.na(data.raw[, cp_trad_col_name]) | is.nan(data.raw[, cp_trad_col_name])
    mod.na <- is.na(data.raw[, cp_mod_col_name]) | is.nan(data.raw[, cp_mod_col_name])
    any.na <- is.na(data.raw[, cp_any_col_name]) | is.nan(data.raw[, cp_any_col_name])

    if(ModelFunctionModOnlyObs(write.model.fun)) {
        ## 1)
        mod.only <- any.na & trad.na & !mod.na
        if(sum(mod.only) > 0) {
            data.raw[, cp_trad_col_name][mod.only] <- NA
            data.raw[, cp_any_col_name][mod.only] <- NA
        }
    } else {
        ## 2)
        mod.only <- rep(FALSE, nrow(data.raw)) #need this for 'Exclusions'
        trad.round <- trad.na & !mod.na
        if(sum(trad.round) > 0) {
            data.raw[, cp_trad_col_name][trad.round] <- 0.1
            data.raw$rounded.up <- trad.round | data.raw$rounded.up
            message(paste0("There are ", sum(trad.round, na.rm = TRUE), " observations with missing prevalence for CP Traditional but non-missing for modern. \n\tIn the input data file, these are in rows (ignoring header):\n\t\t",
                           paste(rows_in_orig_input[trad.round], collapse = ", "), "\n\t\tThese will be ROUNDED UP to 0.1 percent."))
        }
        mod.round <- mod.na & !trad.na
        if(sum(mod.round) > 0) {
            data.raw[, cp_mod_col_name][mod.round] <- 0.1
            data.raw$rounded.up <- mod.round | data.raw$rounded.up
            message(paste0("There are ", sum(mod.round, na.rm = TRUE), " observations with missing prevalence for CP Modern but non-missing for traditional. \n\tIn the input data file, these are in rows (ignoring header):\n\t\t",
                           paste(rows_in_orig_input[mod.round], collapse = ", "), "\n\t\tThese will be ROUNDED UP to 0.1 percent."))
        }
    }

    ## -------** Round Small Values UP to 0.1 percent

    modern.zero <-
        !is.na(data.raw[, cp_mod_col_name]) &
        (round(data.raw[, cp_mod_col_name], 9) < 0.1)
    if(sum(modern.zero, na.rm = TRUE) > 0) {
        data.raw[, cp_mod_col_name][modern.zero] <- 0.1
        data.raw$rounded.up <- modern.zero | data.raw$rounded.up
        message(paste0("There are ", sum(modern.zero, na.rm = TRUE), " observations with modern prevalence of zero. \n\tIn the input data file, these are in rows (ignoring header):\n\t\t",
                       paste(rows_in_orig_input[modern.zero], collapse = ", "), "\n\t\tThese will be ROUNDED UP to 0.1 percent."))
    }

    trad.zero <-
        !is.na(data.raw[, cp_trad_col_name]) &
        (round(data.raw[, cp_trad_col_name], 9)  < 0.1)
    if(sum(trad.zero, na.rm = TRUE) > 0) {
        data.raw[, cp_trad_col_name][trad.zero] <- 0.1
        data.raw$rounded.up <- trad.zero | data.raw$rounded.up
        message(paste0("There are ", sum(trad.zero, na.rm = TRUE), " observations with traditional prevalence of zero. \n\tIn the input data file, these are in rows (ignoring header):\n\t\t",
                       paste(rows_in_orig_input[trad.zero], collapse = ", "), "\n\t\tThese will be ROUNDED UP to 0.1 percent."))
    }

    any.zero <-
        !is.na(data.raw[, cp_any_col_name]) &
        (round(data.raw[, cp_any_col_name], 9)  < 0.1)
    if(sum(any.zero, na.rm = TRUE) > 0) {
        data.raw[, cp_any_col_name][any.zero] <- 0.1
        data.raw$rounded.up <- any.zero | data.raw$rounded.up
        message(paste0("There are ", sum(any.zero, na.rm = TRUE), " observations with prevalence (any method) of zero. \n\tIn the input data file, these are in rows (ignoring header):\n\t\t",
                       paste(rows_in_orig_input[any.zero], collapse = ", "), "\n\t\tThese will be ROUNDED UP to 0.1 percent."))
    }

    ## -------** CP Any ^= CP Mod but CP Trad blank

    ## If there are values for CP Any and CP Mod, not equal, but CP
    ## Trad is missing, set CP Trad to CP Any - CP Mod.
    mod.neq.any <- (is.na(data.raw[, cp_trad_col_name]) | is.nan(data.raw[, cp_trad_col_name])) & !is.na(data.raw[, cp_mod_col_name]) & !is.na(data.raw[, cp_any_col_name]) &
        (round(data.raw[, cp_mod_col_name], 9) != round(data.raw[, cp_any_col_name], 9))
    if(sum(mod.neq.any, na.rm = TRUE) > 0) {
        message(paste0("There are ", sum(mod.neq.any, na.rm = TRUE), " observations with traditional prevalence missing but Modern and Any are not. \n\tIn the input data file, these are in rows (ignoring header):\n\t\t",
                       paste(rows_in_orig_input[mod.neq.any], collapse = ", "), "\n\t\tTRADITIONAL prevalences for these observations HAVE BEEN SET TO 'ANY' - 'MODERN' and these observations WILL contribute to estimates of the modern/traditional breakdown."))
        data.raw[mod.neq.any,]$Contraceptive.use.TRADITIONAL <-
            data.raw[mod.neq.any,]$Contraceptive.use.ANY - data.raw[mod.neq.any,]$Contraceptive.use.MODERN
    }

    ##  -------** Blank out Modern

    ## If 'TRADITIONAL' is blank AND 'MODERN' = 'ANY' after rounding, set 'MODERN' to blank as well
    trad.blank <- (is.na(data.raw[, cp_trad_col_name]) | is.nan(data.raw[, cp_trad_col_name])) & !is.na(data.raw[, cp_mod_col_name]) & !is.na(data.raw[, cp_any_col_name]) & (round(data.raw[, cp_mod_col_name], 9) == round(data.raw[, cp_any_col_name], 9))
    if(sum(trad.blank, na.rm = TRUE) > 0) {
        message(paste0("There are ", sum(trad.blank, na.rm = TRUE), " observations with traditional prevalence missing. In the input data file, these are in rows (ignoring header):\n\t\t",
                       paste(rows_in_orig_input[trad.blank], collapse = ", "), "\n\t\tMODERN prevalences for these observations HAVE ALSO BEEN SET TO MISSING and these observations will NOT contribute to estimates of the modern/traditional breakdown."))
        data.raw[trad.blank,]$Contraceptive.use.MODERN <- NA
    }

    ## -------** Clean Mod/Trad Breakdown Obs

    bdown <- !is.na(data.raw[, cp_mod_col_name]) & !is.na(data.raw[, cp_trad_col_name])

    ## Fill in: CP Any = CP Mod + CP Trad
    ## This version requires CP Any to be non-missing for mod/trad breakdown obs
    data.raw[, cp_any_col_name][bdown] <-
                 data.raw[, cp_mod_col_name][bdown] + data.raw[, cp_trad_col_name][bdown]

    ## -------* Process Unmet Values

    data.raw[, unmet_col_name] <- round(as.numeric(data.raw[, unmet_col_name]), 9)

    unmet.zero <- !is.na(data.raw[, unmet_col_name]) & (data.raw[, unmet_col_name] == 0)
    if(sum(unmet.zero, na.rm = TRUE) > 0) {
        data.raw[, unmet_col_name][unmet.zero] <-
            sapply(100 - data.raw[, cp_any_col_name][unmet.zero],
                   function(z) min(0.1, z))
        message(paste0("There are ", sum(unmet.zero, na.rm = TRUE), " observations with unmet need of zero. In the input data file, these are in rows (ignoring header):\n\t\t",
                       paste(rows_in_orig_input[unmet.zero], collapse = ", "), "\n\t\tThese will be ROUNDED UP to 0.1 percent."))
    }

    ## -------* Replace NAs with '""' for CHARACTER columns

    char.cols <- numeric(0)
    for(j in 1:ncol(data.raw)) {
        if(is.character(data.raw[,j])) {
            char.cols <- c(char.cols, j)
            idx <- is.na(data.raw[,j])
            data.raw[idx,j] <- ""
        }
    }
    char.col.names <- paste(colnames(data.raw)[char.cols], collapse = ", ")

    ## -------* Fix Excel re-formats

    ## Move this here
    ## 1. Fix age groups that excel made into dates
    ## change JR, 20140409
    data.raw[, age_range_col_name] <-
        InternalFixRange(gsub(" ", "", data.raw[, age_range_col_name]))

    ## Move this here.
    ## [MCW-2016-03-11-2]: Fix end year
    data.raw[, end_year_col_name] <- InternalFixEndYear(data.raw[, end_year_col_name])

    ## -------* Standard Error Columns

    ## From Niamh's version: gets survey-base SEs (2017-04-19)
    ##Add columns that indicate if SEs are available for modern,trad,unmet ratios
    include.modern.ses<-rep(NA,nrow(data.raw))
    include.trad.ses<-rep(NA,nrow(data.raw))
    include.unmet.ses<-rep(NA,nrow(data.raw))

    ##Indicator if modern ratios ses are available
    if(is.null(data.raw[, se_logr_mod_no_use_col_name]))
    {
        include.modern.ses<-rep(0,length(include.modern.ses))
    }
    if(!is.null(data.raw[, se_logr_mod_no_use_col_name]))
    {
        if(any(is.na(data.raw[, se_logr_mod_no_use_col_name])))
        {
            ## Take care of 'NaN's/'Infs' [MCW-2018-01-02]
            include.modern.ses[which(!is.finite(data.raw[, se_logr_mod_no_use_col_name]))]<-0
            ## include.modern.ses[which(is.na(data.raw[, se_logr_mod_no_use_col_name]))]<-0
            include.modern.ses[which(!is.na(data.raw[, se_logr_mod_no_use_col_name]))]<-1
        }
        if(all(!is.na(data.raw[, se_logr_mod_no_use_col_name])))
        {
            include.modern.ses<-rep(1,length(include.modern.ses))
        }
    }

    ##Indicator if traditional ratios ses are available
    if(is.null(data.raw[, se_logr_trad_no_use_col_name]))
    {
        include.trad.ses<-rep(0,length(include.trad.ses))
    }
    if(!is.null(data.raw[, se_logr_trad_no_use_col_name]))
    {
        if(any(is.na(data.raw[, se_logr_trad_no_use_col_name])))
        {
            ## Take care of 'NaN's/'Infs' [MCW-2018-01-02]
            include.trad.ses[which(!is.finite(data.raw[, se_logr_trad_no_use_col_name]))]<-0
            ## include.trad.ses[which(is.na(data.raw[, se_logr_trad_no_use_col_name]))]<-0
            include.trad.ses[which(!is.na(data.raw[, se_logr_trad_no_use_col_name]))]<-1
        }
        if(all(!is.na(data.raw[, se_logr_trad_no_use_col_name])))
        {
            include.trad.ses<-rep(1,length(include.trad.ses))
        }
    }

    ##Indicator if unmet ratios ses are available
    if(is.null(data.raw[, se_logr_unmet_no_need_col_name]))
    {
        include.unmet.ses<-rep(0,length(include.unmet.ses))
    }
    if(!is.null(data.raw[, se_logr_unmet_no_need_col_name]))
    {
        if(any(is.na(data.raw[, se_logr_unmet_no_need_col_name])))
        {
            ## Take care of 'NaN's/'Infs' [MCW-2018-01-02]
            include.unmet.ses[which(!is.finite(data.raw[, se_logr_unmet_no_need_col_name]))]<-0
            ## include.unmet.ses[which(is.na(data.raw[, se_logr_unmet_no_need_col_name]))]<-0
            include.unmet.ses[which(!is.na(data.raw[, se_logr_unmet_no_need_col_name]))]<-1
        }
        if(all(!is.na(data.raw[, se_logr_unmet_no_need_col_name])))
        {
            include.unmet.ses<-rep(1,length(include.unmet.ses))
        }
    }
    data.raw<-cbind(data.raw,include.trad.ses,include.modern.ses,include.unmet.ses)

    if(ModelFunctionSurveySEs(write.model.fun) && is.null(iso.select)) {
        if(!isTRUE(all(c(sum(include.modern.ses)
                        ,sum(include.trad.ses)
                        ,sum(include.unmet.ses)) > 0))) {
            stop("'write.model.fun' requires survey-based SEs for 'modern', 'traditional', and 'unmet', but none present in the input file.")
        }
    }

    ## -------* Exclusions

    ##details<<
    ## Observations are excluded if column \code{EXCLUDE1isyes} == 1.
    ## Observations are excluded if \code{Note.on.methods} is "Data pertain to methods used since the last pregnancy."
    ## or "Data pertain to past or current use.".
    ## Observations are excluded if total use is missing, unless they are from service statistics.
    ## (one survey in Bhutan with modern only)
    if (is.null(data.raw[, note_on_methods_col_name])) data.raw[, note_on_methods_col_name] <- rep(NA, nrow(data.raw))

    remove_1 <-
        !is.na(data.raw[, exclude_1_is_yes_col_name]) & data.raw[, exclude_1_is_yes_col_name] == 1

    remove_2 <-
        (is.na(data.raw[, cp_any_col_name]) & is.na(data.raw[, unmet_col_name]) &
         !grepl(data.cell.vals$service.statistic, data.raw[, data_series_type_col_name]) &
         !mod.only #<< keep obs if have only CP Modern.
        )

    remove_3 <-
        (!is.na(data.raw[, note_on_methods_col_name]) &
         (grepl(data.cell.vals$data.pertain.to.methods.used.since.the.last.pregnancy, data.raw[, note_on_methods_col_name]) |
          grepl(data.cell.vals$data.pertain.to.past.or.current.use, data.raw[, note_on_methods_col_name])
         )
        )

    remove <- remove_1 | remove_2 | remove_3

    if (any(remove)) {
        data.raw <- data.raw[!remove, ]
        if(print.messages) {
            msg <- "Excluding certain observations:"
            if (any(remove_1))
                msg <- paste0(msg, "\n\t'", exclude_1_is_yes_col_name, "' is '1'",
                              "\n\t\tRows (ignoring header): ", paste0(rows_in_orig_input[remove_1], collapse = ", "))
            if (any(remove_2))
                msg <- paste0(msg, "\n\t'", cp_any_col_name, "' is 'NA' and '", data_series_type_col_name, "' is NOT service statistic",
                              "\n\t\tRows (ignoring header): ", paste0(rows_in_orig_input[remove_2], collapse = ", "))
            if (any(remove_3))
                msg <- paste0(msg, "\n\t'", note_on_methods_col_name, "' is data pertain to methods used since last pregnancy or data pertain to past or current use",
                              "\n\t\tRows (ingoring header): ", paste0(rows_in_orig_input[remove_3], collapse = ", "))
            message(msg)
        }
        rows_in_orig_input <- rows_in_orig_input[!remove]
    }

    ## AFTER fixing all column headings and cells: Make sure only one
    ## observation per catalogue ID, age-group, marital status.
    if(!is.null(data.raw[, catalog_id_col_name])) {
        if(sum(is.na(data.raw[, catalog_id_col_name])) > 0) {
            if(print.warnings) {
                message("Checking for duplicate rows:\n\tThere are missing 'Catalog.ID's in row(s) (ignoring header):\n\t\t",
                        paste0(rows_in_orig_input[is.na(data.raw[, catalog_id_col_name])], collapse = ", "))
            }
        }
        dup.row <-
            as.data.frame(with(data.raw, table(Catalog.ID, Age..range, Population.type)))
        dup.row <- dup.row[dup.row$Freq > 1,]
        if(nrow(dup.row) > 0) {
            if(print.warnings) message("Checking for duplicate rows:\n\tDuplicate Catalog ID*Age..range*Population.type combinations exist in 'data.csv'. NOTE: Nothing was removed but the duplicates have 'Catalog.ID's:\n\t\t",
                                       paste0(dup.row$Catalog.ID, collapse = ", "))
        }
    } else {
        if(print.warnings) message("No 'Catalog.ID' column in ", data.csv, "; input data not checked for duplicates.")
    }

    return(data.preprocessed = data.raw)
}

###----------------------------------------------------------------------------------
### Preprocess classification file
###
PreprocessClassification <-
    function(regioninfo.csv = NULL,
             check.missing = TRUE) {

        ## -------* Set up

        if(is.null(regioninfo.csv)) {
            regioninfo.csv <-
                file.path(find.package("FPEMglobal")
                         ,"data", "Country-and-area-classification.csv"
                          )
        }

        ## -------* Constants

        ## Create column name regexps and cell value regexps
        data.col.names <- InternalRegExpsInputCols()
        data.cell.vals <- InternalRegExpsInputCells()

        ## -------* MAIN BODY

        ## Read csv
        country.info <- read.csv(regioninfo.csv, stringsAsFactors = FALSE)

        ## Make sure columns named properly
        for(i in 1:length(data.col.names$regex)) {
            names(country.info)[grepl(pattern = data.col.names$regex[i], x = names(country.info))] <-
                data.col.names$df.names[i]
        }

        ## Remove Sark because it has no 3-letter code
        if(680 %in% as.numeric(country.info$ISO.code)) {
            country.info <-
                country.info[as.numeric(country.info$ISO.code) != 680,]
            message("'Sark' is in '", regioninfo.csv, "' but will be ignored because it has no 3-letter country code.")
        }

        ## Check for missing values
        if(check.missing) {

            ## Check for missing values
            miss.v <- which(country.info == "" | country.info == NA, arr.ind = TRUE)
            if(sum(miss.v) > 0) {
                stop("Missing values in ", "'", regioninfo.csv, "'", " at row "
                    ,miss.v[1], ", col ", miss.v[2]
                   ," ('", colnames(country.info)[miss.v[2]], "')", ".")
            }
        }

        ## -------* Return

        return(country.info)

    }


###----------------------------------------------------------------------------------
##' Read in alternative file aggregates with some checks.
##'
##' \code{\link{ReadFileAggregates}} checks some column names using \code{\link{PreprocessClassification}}.
##'
##' @param file.aggregates If NULL (default), UNDP aggregates are
##'     constructed. Alternatively, file path of alternative grouping should be
##'     given, e.g. \code{file.aggregates = "data/MDGgroupings.csv")}. Such data
##'     file needs to contain columns \code{iso.country}, \code{groupname} and
##'     \code{iso.group} (which may contain missing values). Each country can
##'     only be included once (can only be part of one grouping).
##' @return
##' @author Mark Wheldon
##' @noRd
ReadFileAggregates <- function(file.aggregates) {

    ## -------* SET UP

    ## -------** Inputs

    agg.info <- PreprocessClassification(file.aggregates, check.missing = FALSE)

    ## -------* MAIN

    ## -------* RETURN
    }


###----------------------------------------------------------------------------------
### Read contraceptive prevalence data
###
ReadDataAll <-
    function(## To use alternative data set, use \code{data.csv =
             ## .../dataCPmodel.csv}, where ... refers to the file path where
             ## file is located.
             data.csv = NULL,
             data.preprocessed = NULL,
             ## R object produced by \code{\link{PreprocessData}}. If
             ## \code{NULL} and data.csv NULL, data set included in package is
             ## used.

             output.dir = NULL,

             ## To use alternative csv file, use \code{regioninfo.csv =
             ## .../Country-and-area-classification.csv}, where ... refers to
             ## the file path where file is located.
             regioninfo.csv = NULL,
             regioninfo.preprocessed = NULL,
             ## R object produced by \code{\link{PreprocessData}}. If
             ## \code{NULL} and regioninfo.csv is NULL, region info included in
             ## package is used.

             ##<< If \code{NULL}, data of all countries/subpopulations in data
             ## set are read in, else only data of countries/subpopulations with
             ## selected ISO code is read in.
             iso.select = NULL,

             ##<< If not \code{NULL}, data is treated as data from subpopulation
             ##of country with the ISO code iso.country.select.
             iso.country.select = NULL,

             ##<< Country name corresponding to
             ##\code{iso.country.select}. Cannot be \code{NULL} if
             ##\code{iso.country.select} is not \code{NULL}.
             name.country.select = NULL,

             ##<< Do first pass run of run with SS data?
             do.SS.run.first.pass = do.SS.run.first.pass,

             ##<< csv file with ISO 3-character and 3-digit country codes, only
             ##used to convert iso.country.select into 3-digit country code if
             ##given in 3-character country code.
             countrycodes.csv = "data/Country-names-and-codes.csv",

             write.model.fun = "WriteModel",

             ##<<If not NULL, summary results are written to this HTML file.
             html.file = NULL,

             ## Count PMA as its own source? (This is now obsolete)
             disagg.RN.PMA = TRUE,

             ## Include countries with no data?
             include.c.no.data = TRUE,

             validation.list = NULL,

             ## Ensure that each country has at least
             ## 'at.random.min.c' data points in training set.
             at.random.min.c = 1,

             ## Should obs in countries with only 1 obs be kept in training for 'at.end' validation?
             at.end.not.1.obs.c = FALSE,

             ## Needed for SE imputations
             data.global = NULL,
             do.country.specific.run = FALSE

             ){

        ## -------* Set-up

        ## -------** Function arguments

        if (!is.null(iso.country.select) & is.null(name.country.select)) # change JR, 20140409
            stop("name.country.select cannot be NULL if iso.country.select is non-NULL.")

        ## Create new html file
        if (!is.null(html.file)){
            cat("", file = html.file, append = F)
            print(paste("Summary stats written to", html.file))
        }

        if (is.null(data.csv) && is.null(data.preprocessed)){
            data.csv <- file.path(find.package("FPEMglobal"), "data", "dataCPmodel.csv")
            cat(paste("CP data read from", data.csv), "\n")
        }
        if (is.null(regioninfo.csv) && is.null(regioninfo.preprocessed)){
            regioninfo.csv <-
                file.path(find.package("FPEMglobal"), "data", "Country-and-area-classification.csv")
        }
        cat(paste("Country/region info read from", regioninfo.csv), "\n")

        ## -------** Constants

        ## Create column name regexps and cell value regexps
        data.col.names <- InternalRegExpsInputCols()
        data.cell.vals <- InternalRegExpsInputCells()

        ## -------* Read in csv files, process

        ## -------** Input Data

        if(is.null(data.preprocessed)) {
        data.unsorted <-
            PreprocessData(data.csv = data.csv, iso.select = iso.select)
        } else data.unsorted <- data.preprocessed

        ## -------** Region info

        if(is.null(regioninfo.preprocessed)) {
            country.info.temp <- PreprocessClassification(regioninfo.csv)
            } else country.info.temp <- regioninfo.preprocessed

        ## -------*** Country names

        ## [MCW-2016-12-28-1] :: Replace country names in 'data.csv' with names in the
        ## classifications file. This protects against problems caused when the same
        ## country (as ID'd by ISO.Code) has different variants of its name (e.g.,
        ## 'Bolivia' and 'Bolivia (Plurinational State of)').
        if("Country" %in% colnames(data.unsorted)) {
            country.name.col <- which(colnames(data.unsorted) == "Country")
            data.unsorted <-
                merge(data.unsorted[,-country.name.col]
                     ,country.info.temp[,c("ISO.code"
                                          ,"Country.or.area"
                                           )]
                     ,all.x = TRUE, all.y = FALSE, by = "ISO.code", sort = FALSE
                      )
            colnames(data.unsorted)[colnames(data.unsorted) == "Country.or.area"] <- "Country"
            message("\nCountry names in 'data.csv' have been replaced by those in the country classifications file, matched by ISO code.")
        }

        ## -------*** Service statistics first pass?

        if (do.SS.run.first.pass) {
            remove <- grepl(data.cell.vals$service.statistic, data.unsorted$Data.series.type)
            if (any(remove)) {
                data.unsorted <- data.unsorted[!remove, ]
                cat(paste0(sum(remove), " service statistics data observations removed.\n"))
            }
        }

        ## -------*** Check that all ISO codes are in country classifications file

        ## [MCW-2017-06-23-2] :: Check that all countries in the input data are in the classifications file.
        not.in.cinfo <-
            unique(data.unsorted$ISO.code)[!(unique(data.unsorted$ISO.code) %in% country.info.temp$ISO.code)]
        if(length(not.in.cinfo > 0)) stop("Country with ISO code ", not.in.cinfo, " is in the input data file but not in the country classifications file. Fix and re-run")

        ## -------* Make Data Frame to Contain Data

        ## -------** Proportions

        ## For some obs there is no break-down of total into modern
        ## and trad CP.  Re-order the rows such that all modern obs
        ## are first, followed by obs with total only (makes things
        ## easier when constructing the data set for BUGS). Put obs
        ## with modern only right at the end. These are identifiable
        ## by a non-missing value for modern and a missing value for
        ## tot.

        J <- nrow(data.unsorted)

        props.modern.j <- round(data.unsorted$Contraceptive.use.MODERN/100, 9)
        props.tot.j <- round(data.unsorted$Contraceptive.use.ANY/100, 9)
        props.trad.j <- round(data.unsorted$Contraceptive.use.TRADITIONAL/100, 9)

        breakdown.ord <- !is.na(props.modern.j) & !is.na(props.trad.j)
        tot.ord <- !breakdown.ord & !is.na(props.tot.j)
        last.ord <- !breakdown.ord & !tot.ord
        order <- c(seq(1, J)[breakdown.ord]
                  ,seq(1, J)[tot.ord]
                  ,seq(1, J)[last.ord])

        props.modern.j <- props.modern.j[order]

        props.tot.j <- round(data.unsorted$Contraceptive.use.ANY[order]/100, 9)

        props.trad.j <- round(data.unsorted$Contraceptive.use.TRADITIONAL[order]/100, 9)

        ## -------** Rounding

        rounded.up <- data.unsorted$rounded.up[order]

        ## -------** CP any less than 1%

        less.than.1.pc <- props.tot.j < 0.01

        ## -------** Years

        years.j <- ((data.unsorted$Start.year+data.unsorted$End.year)/2)[order]
        start.j = data.unsorted$Start.year[order] #Change NC, 20170201
        end.j = data.unsorted$End.year[order] #Change NC, 20170201

        ## -------** Sort whole data frame

        data <- data.unsorted[order,]

        ## -------** Country names

        if (is.null(iso.country.select)) { # change JR, 20140404
            name.j <- data$Country
            name.unsorted <- data.unsorted$Country
        } else {
            name.j <- data$New.population
            name.unsorted <- data.unsorted$New.population
        }

        ## -------** Age

        ##details<< Age categorization:
        ##details<< - If group starts at 13-17 and ends at 47-51: base line (0).
        ##details<< - If group start at 13-17 but ends after 51: negative bias ("neg").
        ##details<< - "See notes" is positive.
        ##details<< - Other groups get flagged with "?" (other).
        Age <- data$Age..range

        has_age_category_bias_data <- ("age.cat.bias" %in% colnames(data)) && any(data$age.cat.bias != "")

        if (has_age_category_bias_data) {
            if(!isTRUE(all(as.character(data$age.cat.bias) %in% c("+", "-", "?", 0), na.rm = TRUE))) {
                warning("'age.cat.bias' is in input data but contains illegal entries. The only entries allowed are: 'as.character(data$age.cat.bias)) %in% c('+', '-', '?', 0), na.rm = TRUE)' This will probably cause the run to fail so fix this. You can delete 'age.cat.bias' to revert to default behaviour for 15--49 based on 'Age..range' column.")
            }
            age.cat.j <- data$age.cat.bias
        } else {
            message("Age category bias derived from 'Age..range' column. NOTE: This is not yet implemented for age groups other than 15--49.")
            age.cat.j <- rep(NA,J)
            old.warn <- getOption("warn")
            options(warn=-1) # next part will give warnings, suppress those
            for (j in 1:J){
                split <- as.integer(strsplit(Age[j], split = "")[[1]]) # gives a warning
                age.cat.j[j] <-
                    ifelse(length(split)!=5 ,"?",
                    ifelse(split[1]==1 & split[2]<8 & split[2]>2 & ((split[4]==4 & split[5]>6)|(split[4]==5 & split[5]<2)) , 0,
                    ifelse(split[1]==1 & split[2]<8 & split[2]>2 & split[4]==5 & split[5]>1 , "-","?")))
            }
            options(warn = old.warn) # back to original setting
            age.cat.j <- ifelse(Age == "See notes", "+", age.cat.j)
            if (!is.null(html.file)){
                print(xtable::xtable(xtabs(~Age+age.cat.j), digits = c(0,0,0,0,0), type = "html",
                             caption = "Age"), type="html", file = html.file, append = T)
                print(xtable::xtable(table(age.cat.j), digits = c(0,0), type = "html",
                             caption = "Age"), type="html", file = html.file, append = T)
                                # print(xtabs(~Age+age.cat.j))
            }
        }

        ## -------** Data Series Type

        ## get other columns in shape so that they're meaningful
        ## then have separate function to get the input for bugs

        source <- data$Data.series.type

        ##[MCW-2016-03-24-4] check if need to disaggregate repeated national and PMA
        if (disagg.RN.PMA) {
            source.j <- ifelse(is.element(source, c("DHS", "DHS microdata")), "DHS",
                        ifelse(is.element(source, c("MICS", "MICS1", "MICS2", "MICS3", "MICS4"
                                                   ,"MICS5", "MICS microdata"
                                                    )), "MICS",
                        ifelse(is.element(source, c("National survey", "National Survey")), "NS",
                        ifelse(source %in% c("Repeated-natl", "Repeated national"
                                            ,"Repeated national survey"), "RN",
                        ifelse(source %in% c("PMA", "PMA microdata", "PMA Microdata", "PMA micro-data", "PMA Micro-data", "PMA Micro-Data", "PMA micro data", "PMA Micro data", "PMA Micro Data"), "PMA",
                        ifelse(grepl("Service statistic", source), "SS",
                               "Other"))))))
        } else {
            source.j <- ifelse(is.element(source, c("DHS", "DHS microdata")), "DHS",
                        ifelse(is.element(source, c("MICS", "MICS1", "MICS2", "MICS3", "MICS4"
                                                   ,"MICS5" # change [MCW-2016-02-22-1]: added MICS5
                                                    )), "MICS", # change JR, 20140310: added MICS and MICS4
                        ifelse(source == "National survey", "NS",
                        ifelse(grepl("Service statistic", source), "SS", # change JR, 20131120 # change JR, 20140418
                               "Other"))))
        }

        RN.rows <- source.j == "RN"
        if(any(RN.rows)) {
            warning("Data source type 'RN' ('repeated national survey') is depracated. Data source in rows (ignoring header) "
                   ,paste(which(RN.rows), collapse = ", ")
                   ," has been set to 'NS' ('national survey').\n**PLEASE MAKE SURE THIS IS WHAT YOU WANT**")
        }
        source.j[RN.rows] <- "NS"

        other.rows <- source.j == "Other"
        if(any(other.rows)) {
            message("\nThe following data sources have been classified as 'Other': "
                   ,paste(unique(source[other.rows]), collapse = ", "))
            }

        if (!is.null(html.file)){
            print(xtable::xtable(xtabs( ~ source + source.j), digits = c(0,0,0,0,0), type = "html",
                         caption = "Source"), type="html", file = html.file, append = T)
            print(xtable::xtable(table(source.j), digits = c(0,0), type = "html",
                         caption = "Source"), type="html", file = html.file, append = T)
        }
        geo.j <- data$GEO.biases..unknown.direction.
        if (all(is.na(geo.j))) # change JR, 20131121: convert NA's to ""
            geo.j[is.na(geo.j)] <- ""
        geo.short.j <- ifelse(geo.j != "", 1, 0)
        if (!is.null(html.file)){
            print(xtable::xtable(table(geo.j), digits = c(0,0), type = "html",
                         caption = "Geo"), type="html", file = html.file, append = T)
            print(xtable::xtable(table(geo.j!=""), digits = c(0,0), type = "html",
                         caption = "Geo"), type="html", file = html.file, append = T)
        }

        ## -------** Population Type

        poptype.j <- data$Population.type
        poptype.short.j <- ifelse(is.element(poptype.j, c("BS", "HW")), "BSHW", paste(poptype.j))
        if (!is.null(html.file)){
            print(xtable::xtable(xtabs(~ poptype.j + poptype.short.j), digits = rep(0,6), type = "html",
                         caption = "Pop type"), type="html", file = html.file, append = T)
            print(xtable::xtable(table(poptype.short.j), digits = c(0,0), type = "html",
                         caption = "Pop type"), type="html", file = html.file, append = T)
        }

        ## -------** Biases

        ## -------*** Folk Bias

        folkbias.j <- data$Folk.method.positive.bias
        if (all(is.na(folkbias.j))) # change JR, 20131121: convert NA's to ""
            folkbias.j[is.na(folkbias.j)] <- ""

        ## -------*** Positive Bias

        posbias.j <- data$Non.pregnant.and.other.positive.biases
        if (all(is.na(posbias.j))) # change JR, 20131121: convert NA's to ""
            posbias.j[is.na(posbias.j)] <- ""

        ## -------*** Modern Bias

        mod.bias.j <- data$Modern.method.bias
        if (all(is.na(mod.bias.j))) # change JR, 20131121: convert NA's to ""
            mod.bias.j[is.na(mod.bias.j)] <- ""

        ## -------*** Probing Q's Bias

        probe.bias.j <- data$Absence.of.probing.questions.bias
        if (all(is.na(probe.bias.j))) # change JR, 20131121: convert NA's to ""
            probe.bias.j[is.na(probe.bias.j)] <- ""

        ## -------** Print

        if (!is.null(html.file)){
            print(xtable::xtable(table(folkbias.j), digits = rep(0,2), type = "html",
                         caption = "Folk"), type="html", file = html.file, append = T)
            print(xtable::xtable(table(folkbias.j!=""), digits = rep(0,2), type = "html",
                         caption = "Folk"), type="html", file = html.file, append = T)
            print(xtable::xtable(table(posbias.j), digits = rep(0,2), type = "html",
                         caption = "+bias"), type="html", file = html.file, append = T)
            print(xtable::xtable(table(posbias.j!=""), digits = rep(0,2), type = "html",
                         caption = "+bias"), type="html", file = html.file, append = T)
            print(xtable::xtable(table(mod.bias.j), digits = rep(0,2), type = "html",
                         caption = "Bias modern"), type="html", file = html.file, append = T)
            print(xtable::xtable(table(probe.bias.j), digits = rep(0,2), type = "html",
                         caption = "Bias absence of probing questions"), type="html", file = html.file, append = T)
        }

        ## -------** Unmet Need

        source.unmet.j <- ifelse(source.j=="DHS", "DHS", "Other")
        props.unmet.j <- round(data$Unmet/100, 9)
        if (!is.null(html.file)){
            print(xtable::xtable(table(source.unmet.j[!is.na(props.unmet.j)]), digits = rep(0,2), type = "html",
                         caption = "Source Unmet need"), type="html", file = html.file, append = T)
        }

        ## -------** Survey-based SEs (copied from Niamh's 2017-04-19)

        ##SEs for log ratios #Change NC
        if(!is.null(data.unsorted$SE.logR.trad.nouse))
            se.logR.trad.nouse<-data.unsorted$SE.logR.trad.nouse[order] #Change NC, 20161219
        if(is.null(data.unsorted$SE.logR.trad.nouse))
            se.logR.trad.nouse<-rep(NA,J)

        if(!is.null(data.unsorted$SE.logR.modern.nouse))
            se.logR.modern.nouse<-data.unsorted$SE.logR.modern.nouse[order] #Change NC, 20161219
        if(is.null(data.unsorted$SE.logR.modern.nouse))
            se.logR.modern.nouse<-rep(NA,J)

        if(!is.null(data.unsorted$SE.logR.unmet.noneed))
            se.logR.unmet.noneed<-data.unsorted$SE.logR.unmet.noneed[order] #Change NC, 20161219
        if(is.null(data.unsorted$SE.logR.unmet.noneed))
            se.logR.unmet.noneed<-rep(NA,J)

        ##Indicator if SE is included or not, Change NC
        include.trad.ses<-data.unsorted$include.trad.ses[order] #Change NC, 20161219
        include.modern.ses<-data.unsorted$include.modern.ses[order] #Change NC, 20161219
        include.unmet.ses<-data.unsorted$include.unmet.ses[order] #Change NC, 20161219

        ## -------** Put in Data Frame

        data <- data.frame(iso.j = data$ISO.code, name.j, # change JR, 20140404
                           years.j, props.tot.j, props.modern.j, props.trad.j,
                     start.j, end.j, #Change NC, 20170102
                           age.cat.j, source.j, poptype.j, geo.j,
                           posbias.j, folkbias.j, mod.bias.j, probe.bias.j,
                           props.unmet.j, source.unmet.j,
                           rounded.up,
                           less.than.1.pc,
                          se.logR.trad.nouse,
                          se.logR.modern.nouse,
                           se.logR.unmet.noneed,
                          include.trad.ses,
                          include.modern.ses,
                           include.unmet.ses,
                     stringsAsFactors=FALSE)

        ## -------* Validation Run?

        if(!is.null(validation.list)) {

            ## -------** Draw Training Set

            ## Move generation of 'getj.training.k' via 'GetTraining()' here from
            ## 'GetBugsData()' so that 'at.random.no.data' validation exercise can
            ## be handled.
            ##details<< If \code{!is.null(validation.list)}, \code{getj.training.k} is constructed
            ## using \code{\link{GetTraining}}
            if (validation.list$generate.new.set){
                getj.training.k <- GetTraining(data, winbugs.data = NULL,
                                               validation.list = validation.list,
                                               country.info = country.info.temp,
                                               seed = validation.list$seed
                                              ,at.random.min.c = at.random.min.c
                                               ,at.end.not.1.obs.c = at.end.not.1.obs.c)
                if(isTRUE(validation.list$at.random.no.data) || isTRUE(validation.list$leave.iso.out)) {
                    save(getj.training.k, file = file.path(output.dir, "getj.training.k.orig.rda"))
                } else {
                    save(getj.training.k, file = file.path(output.dir, "getj.training.k.rda")) # change JR, 20140418
                }
            } else { # not used!
                load(file = file.path(output.dir, "getj.training.k.rda")) ## change JR, 20140418
            }
        } else {
            getj.training.k <- 1:J
        }

        ## -------** Training and Test Set data

        if(isTRUE(validation.list$at.random.no.data) || isTRUE(validation.list$leave.iso.out)) {
            data.test <- data[sort(setdiff(1:J, getj.training.k)),]
            data <- data[sort(getj.training.k),]
            write(unique(as.character(data.test$name.j)), file = file.path(output.dir, "test-set-countries.txt"))
            message("\nTest set countries written to ", file.path(output.dir, "test-set-countries.txt"))
        }

        ## -------* Make Data Frame for Country Info

        ## -------** Countries with Data

        ## Takes countries in the input data file and merges on regions,
        ## sub-regions, etc. from the classifications file.
        country.info <- unique(data.frame(data$name.j, # change JR, 20140404
                                          gsub(" ", "", data$iso.j), # change JR, 20131104
                                          stringsAsFactors=FALSE)
                               )
        names(country.info) <- c("name.c", "iso.c")
        country.info <- merge(country.info
                             ,country.info.temp[,c("ISO.code", "Country..letter.code")]
                             ,by.x = "iso.c", by.y = "ISO.code"
                             ,all.x = TRUE, sort = FALSE
                              )
        names(country.info)[3] <- "code.c"

        ## Keep only training set
        idx <- country.info$iso.c %in% data$iso.j
        country.info <- country.info[idx,]

        ## name.short.c <- InternalMakeCountryNamesShort(country.info$name.c)
        ## note: when using table, the output is in alphabetical order, thus China and China, Hong Kong are swapped
        ## swap back!
        ## sum(names(table(data$Country)[name.c])!=  paste(name.c))
        ## sum((table(data$Country)[name.c])!=  table(data$Country))

        ## 'N.c' is the number of observations for this country.
        N.c <- table(data$iso.j)[country.info$iso.c]

        ## Region info. Taken from the 'regioninfo.csv' file.
        if (is.null(iso.country.select)) { # change JR, 20140404
            reg.country.info.c <-
                InternalGetRegionInfoForCountry(iso.c = country.info$iso.c,
                                                country.info = country.info.temp, #Use '.temp' because 'country.info' has had training set countries removed
                                                countrycodes.csv = countrycodes.csv
                                               ,write.model.fun = write.model.fun)
        } else {
            reg.country.info.c <-
                InternalGetRegionInfoForCountry(iso.c = iso.country.select,
                                                country.info = country.info.temp,#Use '.temp' because 'country.info' has had training set countries removed
                                                countrycodes.csv = countrycodes.csv
                                               ,write.model.fun = write.model.fun)
        }

        ## Check to see if sexual activity column included
        if("sex.ac.unm.c" %in% colnames(reg.country.info.c)) {
            reg.country.cols.select <-
                c("namereg.c", "reg.c", "namesubreg.c", "subreg.c"
                ,"ssa.c", "dev.c", "fp2020.c"
                 ,"name.sex.ac.unm.c", "sex.ac.unm.c"
                 ,"name.reg.in.sex.ac.unm.c", "reg.in.sex.ac.unm.c"
                 ,"name.subreg.in.sex.ac.unm.c", "subreg.in.sex.ac.unm.c"
                 ,"name.reg.in.sex.ac.unm.SA1sub.c", "reg.in.sex.ac.unm.SA1sub.c"
                 ,"name.subreg.in.sex.ac.unm.SA1sub.c", "subreg.in.sex.ac.unm.SA1sub.c"
                  )
        } else {
            reg.country.cols.select <-
                c("namereg.c", "reg.c", "namesubreg.c", "subreg.c"
                ,"ssa.c", "dev.c", "fp2020.c")
        }
        country.info <- data.frame(country.info, N.c = as.numeric(N.c),
                                   reg.country.info.c[, reg.country.cols.select],
                                   stringsAsFactors=FALSE)

        if (!is.null(iso.country.select)) # change JR, 20140404
            country.info <- data.frame(country.info,
                                       iso.country.select = iso.country.select,
                                       name.country.select = name.country.select, # change JR, 20140409
                                       stringsAsFactors=FALSE)
        if (!is.null(html.file)){
            print(xtable::xtable(country.info, type = "html",
                         caption = "Country info"), type="html", file = html.file, append = T)
        }
        region.info <- InternalGetRegionInfoGeneral(country.info = country.info)

        ## -------** Countries without data

        ## [MCW-2016-08-23-1] Add countries with no data. Countries with no
        ## data have separate data frames so they can be treated differently in
        ## the JAGS model.

        if(include.c.no.data && !isTRUE(validation.list$at.random.no.data) && !isTRUE(validation.list$leave.iso.out)) {
            iso.no.data <- !(country.info.temp$ISO.code %in% country.info$iso.c)
                                #Use '.temp' because 'country.info' has had
                                #training set countries removed
        } else if(isTRUE(validation.list$at.random.no.data) || isTRUE(validation.list$leave.iso.out)) {
            iso.no.data <- country.info.temp$ISO.code %in% data.test$iso.j
                                #Use '.temp' because 'country.info' has had
                                #training set countries removed
            } else iso.no.data <- 0
        if(isTRUE(sum(iso.no.data) > 0)) {

            ## -------*** Deal with 'iso.country.select'

            if(!is.null(iso.country.select)) {
                iso.no.data.select <-
                    iso.country.select[iso.country.select %in% country.info.temp$ISO.code[iso.no.data]]
            }
            if(!is.null(name.country.select)) {
                name.no.data.select <-
                    name.country.select[name.country.select %in% country.info.temp$Country..letter.code[iso.no.data]]
            }

            ## -------*** Country.info.no.data

            country.info.no.data <-
                data.frame(name.c = country.info.temp[iso.no.data, "Country.or.area"]
                          ,iso.c = as.character(country.info.temp[iso.no.data, "ISO.code"]) #to match country.info
                          ,code.c = country.info.temp[iso.no.data, "Country..letter.code"] #Used in the parameter plots.
                          ,N.c = 0
                           ,stringsAsFactors = FALSE
                           )
            names(country.info.no.data) <- c("name.c", "iso.c", "code.c", "N.c")

            if(isTRUE(validation.list$at.random.no.data) || isTRUE(validation.list$leave.iso.out)) {
            ## Need to fill in column 'N.c' in 'country.info.no.data'. If just
            ## estimating for countries with actually no data this is 0. In this
            ## case, need it to contain actual number of obs.
                country.info.no.data$N.c <- table(data.test$name.j)[country.info.no.data$name.c]
                }

            ## -------*** Region / Subregion Info

            if(is.null(iso.country.select)) {
                reg.country.info.c.no.data <-
                    InternalGetRegionInfoForCountry(iso.c = country.info.temp$ISO.code[iso.no.data], #make sure it's numeric
                                                    country.info = country.info.temp,#Use '.temp' because 'country.info' has had training set countries removed
                                                    countrycodes.csv = countrycodes.csv
                                                    #,no.data = TRUE #numeric codes for (sub)regions not created
                                                   ,write.model.fun = write.model.fun)
            } else {
                reg.country.info.c.no.data <-
                    InternalGetRegionInfoForCountry(iso.c = iso.no.data.select, #make sure it's numeric
                                                    country.info = country.info.temp,#Use '.temp' because 'country.info' has had training set countries removed
                                                    countrycodes.csv = countrycodes.csv
                                                    #,no.data = TRUE #numeric codes for (sub)regions not created
                                                   ,write.model.fun = write.model.fun)
            }

            ## -------*** Merge Country and Region Info

            if(is.null(iso.country.select)) {
            country.info.no.data <-
                data.frame(country.info.no.data
                          ,reg.country.info.c.no.data[,reg.country.cols.select]
                          ,stringsAsFactors = FALSE)
            } else {
                country.info.no.data <-
                    data.frame(country.info.no.data
                              ,iso.country.select = iso.no.data.select
                               ,name.country.select = name.no.data.select
                              ,reg.country.info.c.no.data[,reg.country.cols.select]
                              ,stringsAsFactors = FALSE)
            }

            ## -------*** Fix Numeric Codes for Clusters

            ## 'country.info' and 'country.info.no.data' contain character
            ## variables with the names of clusters ((i.e., (sub)regions, sexual
            ## activity groups, etc) /and/ corresponding numeric codes. Creating
            ## them separately means that different codes might be used for the
            ## same cluster. The next part makes sure 'country.info' and
            ## 'country.info.no.data' use the same numeric codes for all
            ## clusters.

            info.fixed <- InternalFixNumericClusterCodes(country.info, country.info.no.data)
            country.info <- info.fixed$country.info
            country.info.no.data <- info.fixed$country.info.no.data

            ## [MCW-2016-09-07-4] :: 'region.info' and 'region.info.no.data'
            ## should have all the regions and subregions.
            region.info.no.data <-
                InternalGetRegionInfoGeneral(country.info = rbind(country.info, country.info.no.data))
            region.info <- region.info.no.data

            ## -------*** Index for Re-Ordering

            ## [MCW-2016-08-26-9] Create an index to re-order countries as in
            ## 'regioninfo.csv' when countries with no data are estimated. This
            ## is used, for example, in making the plots.
            ## [MCW-2016-09-07-5] :: USE ISO CODES to create re-ordering index.
            all.c <- unique(c(country.info$iso.c, country.info.no.data$iso.c))
            ## Make sure only countries in either 'country.info', or
            ## 'country.info.no.data' are referred to. This has an effect if
            ## doing validation exercise 'at.random.no.data'.
            ISO.all.orig.order <- as.character(country.info.temp$ISO.code)
            ISO.all.orig.order <- ISO.all.orig.order[ISO.all.orig.order %in% all.c]
            input.order.c <- sapply(ISO.all.orig.order, function(w) which(w == all.c))

        } else { ## ends part for no countries with no data

            ## -------** if !no.data

            country.info.no.data <- NULL
            region.info.no.data <- NULL
            input.order.c <- 1:nrow(country.info)
        }

        ## -------* SE Imputations

        if(ModelFunctionSurveySEs(write.model.fun)) {
            ## Do this here so 'at.random.no.data' and 'leave.iso.out' validations can be accommodated.
            ##Info on SEs #Change NC, 20161218
            se.info.j<-ImputeSE(data=data,country.info=country.info,data.global=data.global,do.country.specific.run=do.country.specific.run)
            if(isTRUE(validation.list$at.random.no.data) || isTRUE(validation.list$leave.iso.out)) {
                se.info.j.test <-
                    ImputeSE(data=data.test,country.info=country.info.no.data,data.global=NULL,do.country.specific.run=FALSE)
                ## Possible that no test-set obs will have SEs, so set
                ## them to the imputed values from the training set.
                med.max.se.names <- c("med.max.trad.se", "med.max.modern.se", "med.max.unmet.se")
                logR.se.names <- c("se.logR.trad.impute", "se.logR.modern.impute", "se.logR.unmet.impute")
                for(i in seq_along(med.max.se.names)) {
                    if(is.na(se.info.j.test[[med.max.se.names[i]]])) {
                        se.info.j.test[[med.max.se.names[i]]] <- se.info.j[[med.max.se.names[i]]]
                    }
                }
                for(i in seq_along(logR.se.names)) {
                    ## Set to the imputed value
                    na.idx <- is.na(se.info.j.test[[logR.se.names[i]]])
                    if(any(na.idx)) {
                        se.info.j.test[[logR.se.names[i]]][na.idx] <-
                            rep(se.info.j.test[[med.max.se.names[i]]], sum(na.idx))
                    }
                }
            }
        } else {
            se.info.j <- NULL
            if(isTRUE(validation.list$at.random.no.data) || isTRUE(validation.list$leave.iso.out)) {
                se.info.j.test <- NULL
            }
        }

        ## -------* Special List for Validation Ex. 'at.random.no.data'

        if(isTRUE(validation.list$at.random.no.data) || isTRUE(validation.list$leave.iso.out)) {
            validation.at.random.no.data <-
                list(getj.training.k.orig = getj.training.k #original version
                    ,data.test = data.test
                     ,se.info.j.test = se.info.j.test
                     )

            ## Need to set getj.training.k to all rows of the triaining set.
            getj.training.k <- 1:nrow(data)
        } else {
            validation.at.random.no.data <- NULL
        }

        ## -------* Finish

        ##value<<
        return(list(data = data,##<< Object of class \code{data}, data frame, described here in Details
                    country.info = country.info,##<< Object of class \code{country.info}
                    region.info = region.info,##<< Object of class \code{region.info}
                    country.info.no.data = country.info.no.data,
                    region.info.no.data = region.info.no.data
                     ,se.info.j = se.info.j
                                        #MCW (2017-02-07): Forgotten why I need
                                        #to keep that... plots?
                    ## [MCW-2016-08-26-10] Return the re-ordering vector.
                   ,input.order.c = input.order.c
                   ,getj.training.k = getj.training.k
                    ,validation.at.random.no.data = validation.at.random.no.data
                    ))
    }
###----------------------------------------------------------------------------------
InternalFixRange <- function(# Fix range character vector that got converted to dates in Excel
  ### Replaces month names with numbers
  x
) {
  x <- as.character(x)
  from <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
  to <- as.character(1:12)
  sub.table <- data.frame(from = from, to = to, stringsAsFactors = FALSE)
  for (i in 1:nrow(sub.table))
    x <- gsub(sub.table$from[i], sub.table$to[i], x)
  return(x)
}
###----------------------------------------------------------------------------------
### Regular expressions for column names in input data files.
###
### Put them in a function in this file so that they are standardized across the
### package; any other function that needs them can call this and get the same
### regexps. (Another option is to consider them as 'internal data' and put them
### in 'R/sysdata.rda'.)
###
InternalRegExpsInputCols <- function(write.model.fun = NULL) {
    ##
    ##  General Notes:
    ##
    ##    Regexps are in alphabetical order.
    ##
    ##    The regexps are case insensitive and any number of any kinds of
    ##    whitespace character is allowed between words (e.g., space, tab,
    ##    newline) as well as a full stop '.'. No delimiter is also
    ##    matched. Beginning and end of word markers are included. E.g., 'Age
    ##    range', 'Age.range', 'age range', '. age..range ', 'AgeRange' all
    ##    match the regexp for 'age.range', but 'xage.range' and 'age.range1' do
    ##    not.
    ##
    ##  Regexp notes:
    ##
    ##    '?i' makes search case INsensitive
    ##
    ##    [:space:]' matches tab, newline, vertical tab, form feed, carriage
    ##    return, space and possibly other locale-dependent characters.
    ##

    ## -------* Subroutines

    re.col <- function(txt) {
        txt <- unlist(strsplit(txt, "\\."))
        txt <- txt[!txt==""] #if there are any multiple spaces, remove them
        txt <- paste(txt, collapse = "[.[:space:]]*")
        paste0("^[.[:space:]]*", "(?i:", txt, ")", "[.[:space:]]*$")
    }

    ## -------* MAIN BODY

    ## -------** Input data columns

    ## Vector of strings that will be turned into regular expressions, and will
    ## be the names of the elements of the output list. These regexps are used
    ## to search in the colnames of the input csv.
    list.names.input.cols <- c("age.range"  #becomes 'Age..range' after 'make.names()'.
                              ## ,"age.range.original"
                              ,"catalog.id"
                              ,"contraceptive.use.any"
                              ,"contraceptive.use.modern"
                              ,"contraceptive.use.traditional"
                              ,"country"
                              ,"country.letter.code"
                              ,"country.or.area"
                              ,"data.series.type"
                              ,"developed.region"
                              ,"end.year"
                              ,"exclude.1.is.yes"
                              ,"female.sterilization"
                              ,"folk.method.positive.bias"
                              ,"fp2020.country"
                              ,"geo.biases.unknown.direction"
                              ,"iso.code"
                              ,"least.developed.country"
                              ,"least.developed.regions"
                              ,"less.developed.regions..excluding.least.developed.countries"
                              ,"major.area"
                              ,"major.area.code"
                              ,"modern.method.bias"
                              ,"negative.bias"
                              ,"new.population"
                              ,"non.pregnant.and.other.positive.biases"
                              ,"note.on.country"
                              ,"note.on.methods"
                              ,"note.on.population"
                              ,"population.type"
                             ,"absence.of.probing.questions.bias.1*"
                              ,"region"
                              ,"region.code"
                              ,"SE.logR.modern.nouse"
                              ,"SE.logR.trad.nouse"
                              ,"SE.logR.unmet.noneed"
                              ,"sexual.activity.among.unmarried"
                              ,"sexual.activity.group"
                              ,"source.name"
                              ,"sub.saharan..africa"
                              ,"start.year"
                              ,"total.population.of.less.than.90.000.by.mid.2015"
                              ,"unmet"
                               ## For alternative aggregates (e.g., used by
                               ## 'GetAggregates()' when 'file.aggregates'
                               ## supplied).
                              ,"iso.country"
                              ,"groupname"
                              ,"iso.group"
                               ,"in.union"
                               )

    ## Vector of the names of the data frame columns that are created after reading in
    ## data files. These are a legacy of the specific way in which input file
    ## columns had to be written.
    ## MUST BE IN SAME ORDER as 'list.names.input.cols'!
    df.names.input.cols <- c("Age..range"
                            ## ,"AgeRangeOriginal"
                            ,"Catalog.ID"
                            ,"Contraceptive.use.ANY"
                            ,"Contraceptive.use.MODERN"
                            ,"Contraceptive.use.TRADITIONAL"
                            ,"Country"
                            ,"Country..letter.code"
                            ,"Country.or.area"
                            ,"Data.series.type"
                            ,"Developed..region"
                            ,"End.year"
                            ,"EXCLUDE1isyes"
                            ,"Female.sterilization"
                            ,"Folk.method.positive.bias"
                            ,"FP2020.country"
                            ,"GEO.biases..unknown.direction."
                            ,"ISO.code"
                            ,"Least.developed.country"
                            ,"Least.Developed.Regions"
                            ,"Less.developed.regions..excluding.least.developed.countries"
                            ,"Major.area"
                            ,"Major.area.Code"
                            ,"Modern.method.bias"
                            ,"Negative.bias"
                            ,"New.population"
                            ,"Non.pregnant.and.other.positive.biases"
                            ,"Note.on.country"
                            ,"Note.on.methods"
                            ,"Note.on.population"
                            ,"Population.type"
                             ,"Absence.of.probing.questions.bias"
                            ,"Region"
                            ,"Region.Code"
                              ,"SE.logR.modern.nouse"
                              ,"SE.logR.trad.nouse"
                              ,"SE.logR.unmet.noneed"
                            ,"Sexual.activity.among.unmarried"
                            ,"Sexual.activity.group"
                            ,"Source.name"
                            ,"Sub.Saharan..Africa"
                            ,"Start.year"
                            ,"Total.population.of.less.than.90.000.by.mid.2015"
                            ,"Unmet"
                               ## For alternative aggregates (e.g., used by
                               ## 'GetAggregates()' when 'file.aggregates'
                               ## supplied).
                              ,"iso.country"
                              ,"groupname"
                            ,"iso.group"
                            ,"In.union")
    ## Check that names is same length as list
    stopifnot(all.equal(length(df.names.input.cols)
                       ,length(list.names.input.cols)
                        ))

    ## -------** Create regular expressions

    out.input.cols <- lapply(list.names.input.cols, "re.col")

    ## -------** Name the lists

    ## Names are all lower case with multiple interword spaces replaced with
    ## single spaces. 'make.names()' is then applied, followed by 'tolower()'.

    names(out.input.cols) <- tolower(make.names(list.names.input.cols))

    ## -------** Which columns are REQUIRED?

    name.required <-
        c("Catalog.ID"
         ,"ISO.code"
         ,"Country"
         ,"EXCLUDE1isyes"
         ,"Start.year"
         ,"End.year"
         ,"Age..range"
         ,"Population.type"
         ,"Contraceptive.use.ANY"
         ,"Contraceptive.use.MODERN"
         ,"Contraceptive.use.TRADITIONAL"
         ,"Note.on.country"
         ,"Note.on.population"
         ,"Note.on.methods"
         ,"GEO.biases..unknown.direction."
         ,"Non.pregnant.and.other.positive.biases"
         ,"Negative.bias"
         ,"Modern.method.bias"
         ,"Folk.method.positive.bias"
         ,"Unmet"
         ,"Data.series.type"
         ,"Source.name"
         ,"Absence.of.probing.questions.bias"
         ,"In.union")
    if (!is.null(write.model.fun) &&
        ModelFunctionRateModel(write.model.fun))
        name.required <- c(name.required
                          ,"SE.logR.modern.nouse"
                          ,"SE.logR.trad.nouse"
                          ,"SE.logR.unmet.noneed")
    required <- which(df.names.input.cols %in% name.required)

    ## -------* END

    return(list(std.names = names(out.input.cols),
                regex = out.input.cols, df.names = df.names.input.cols,
                required = required))
}

## A helper function to retrieve actual column names from a data frame
## given 'standard' names
InternalGetDFColNames <- function(std.names, df, regexp.list = NULL,
                                 write.model.fun = NULL) {
    if (is.null(regexp.list)) regexp.list <- InternalRegExpsInputCols(write.model.fun = write.model.fun)
    if (!all(std.names %in% regexp.list$std.names))
        stop("The following 'std.names' are not proper standard names:\n",
             toString(std.names[!std.names %in% regexp.list$std.names]))

    return(regexp.list$df.names[regexp.list$std.names %in% std.names])
}


###----------------------------------------------------------------------------------
# Regular expressions for cell values in input files.
#
# Put them in a function in this file so that they are standardized across the
# package; any other function that needs them can call this and get the same
# regexps. (Another option is to consider them as 'internal data' and put them
# in 'R/sysdata.rda'.)
#
InternalRegExpsInputCells <- function() {
    ##
    ##  General Notes:
    ##
    ##    Regexps are in alphabetical order.
    ##
    ##    The regexps are case insensitive and any number of any kinds of
    ##    whitespace character is allowed between words (e.g., space, tab,
    ##    newline) as well as a full stop '.'. ALSO no delimiter is also
    ##    matched. E.g., 'Age range', 'Age.range', 'age range', 'AgeRange' all
    ##    match the regexp for 'age.range'.
    ##
    ##  Regexp notes:
    ##
    ##    '?i' makes search case INsensitive
    ##
    ##    [:space:]' matches tab, newline, vertical tab, form feed, carriage
    ##    return, space and possibly other locale-dependent characters.
    ##

    ## -------* Functions to make regular expressions

    re.cell <- function(txt) {
        txt <- unlist(strsplit(txt, "[[:space:]]"))
        txt <- txt[!txt==""] #if there are any multiple spaces, remove them
        txt <- paste(txt, collapse = "[.[:space:]]*")
        paste0("^[.[:space:]]*", "(?i:", txt, ")", "[.[:space:]]*$")
    }

    ## -------* MAIN BODY

    ## -------** Input data cells

    list.names.input.cells <-
        c("data pertain to methods used since the last pregnancy"
         ,"data pertain to past or current use"
         ,"repeated national survey"
         ,"service statistic"
          )

    ## -------** Create regular expressions

    out.input.cells <- lapply(list.names.input.cells, "re.cell")

    ## -------** Name the lists

    ## Names are all lower case with multiple interword spaces replaced with
    ## single spaces. 'make.names()' is then applied, followed by 'tolower()'.

    names(out.input.cells) <- tolower(make.names(list.names.input.cells))

    ## -------* END

    return(out.input.cells)
}
###----------------------------------------------------------------------------------
InternalGetRegionInfoGeneral <- function(# Summarize regional info
### Summarize regional info to construct region.info
             country.info
             ){
        n.subreg <- length(unique(country.info$subreg.c))
        n.reg <- length(unique(country.info$reg.c))
        name.subreg <- rep("", n.subreg)
        name.reg <- rep("", n.reg)
                                        # make sure that name.subreg[1] corresponds to same 1 in country.info
                                        #name.subreg <- levels(as.factor(country.info$subreg.c))
                                        #name.reg <- levels(as.factor(country.info$reg.c))
        for (subreg in 1:n.subreg){
            name.subreg[subreg] <- country.info$namesubreg.c[country.info$subreg.c==subreg][1]
        }
        for (reg in 1:n.reg){
            name.reg[reg] <- country.info$namereg.c[country.info$reg.c==reg][1]
        }
        name.reg.short <- name.reg
        name.reg.short <- ifelse(name.reg=="Latin America and the Caribbean","LAC", paste(name.reg.short))
        name.reg.short <- ifelse(name.reg=="Northern America","N. Am.", paste(name.reg.short))

        ## Sexual Activity
        if(all(c("sex.ac.unm.c", "reg.in.sex.ac.unm.c") %in% colnames(country.info))) {
            n.sex.ac.unm <- length(unique(country.info$sex.ac.unm.c))
            name.sex.ac.unm <- rep("", n.sex.ac.unm)
            for(x in 1:n.sex.ac.unm) {
                name.sex.ac.unm[x] <-
                    country.info$name.sex.ac.unm.c[country.info$sex.ac.unm.c == x][1]
            }
            n.reg.in.sex.ac.unm <- length(unique(country.info$reg.in.sex.ac.unm.c))
            name.reg.in.sex.ac.unm <- rep("", n.reg.in.sex.ac.unm)
            for(x in 1:n.reg.in.sex.ac.unm) {
                name.reg.in.sex.ac.unm[x] <-
                    country.info$name.reg.in.sex.ac.unm.c[country.info$reg.in.sex.ac.unm.c == x][1]
            }
            name.reg.in.sex.ac.unm.short <- name.reg.in.sex.ac.unm
            name.reg.in.sex.ac.unm.short <-
                gsub("Latin America and the Caribbean", "LAC"
                   ,name.reg.in.sex.ac.unm.short)
            name.reg.in.sex.ac.unm.short <-
                gsub("Northern America", "N. Am.", name.reg.in.sex.ac.unm.short)

            ## SA1 subregion, India alone
            n.reg.in.sex.ac.unm.SA1sub <- length(unique(country.info$reg.in.sex.ac.unm.SA1sub.c))
            name.reg.in.sex.ac.unm.SA1sub <- rep("", n.reg.in.sex.ac.unm.SA1sub)
            for(x in 1:n.reg.in.sex.ac.unm.SA1sub) {
                name.reg.in.sex.ac.unm.SA1sub[x] <-
                    country.info$name.reg.in.sex.ac.unm.SA1sub.c[country.info$reg.in.sex.ac.unm.SA1sub.c == x][1]
            }
            name.reg.in.sex.ac.unm.SA1sub.short <- name.reg.in.sex.ac.unm.SA1sub
            name.reg.in.sex.ac.unm.SA1sub.short <-
                gsub("Latin America and the Caribbean", "LAC"
                   ,name.reg.in.sex.ac.unm.SA1sub.short)
            name.reg.in.sex.ac.unm.SA1sub.short <-
                gsub("Northern America", "N. Am.", name.reg.in.sex.ac.unm.SA1sub.short)

        }

        region.info <- list(name.subreg = as.character(name.subreg),
                            name.reg = as.character(name.reg),
                            name.reg.short = as.character(name.reg.short),
                            n.subreg = n.subreg, n.reg = n.reg
                           ,name.sex.ac.unm = as.character(name.sex.ac.unm)
                           ,name.reg.in.sex.ac.unm = as.character(name.reg.in.sex.ac.unm)
                           ,name.reg.in.sex.ac.unm.short = as.character(name.reg.in.sex.ac.unm.short)
                          ,n.reg.in.sex.ac.unm = n.reg.in.sex.ac.unm
                           ,n.sex.ac.unm = n.sex.ac.unm
                           ,name.reg.in.sex.ac.unm.SA1sub = as.character(name.reg.in.sex.ac.unm.SA1sub)
                           ,name.reg.in.sex.ac.unm.SA1sub.short = as.character(name.reg.in.sex.ac.unm.SA1sub.short)
                          ,n.reg.in.sex.ac.unm.SA1sub = n.reg.in.sex.ac.unm.SA1sub
                            )
        ##value<<
        return(region.info##<< Object of class \code{\link{region.info}}
               ## where name.subreg[1] refers to index 1 in country.info$subreg.c etc
               )
    }
###---------------------------------------------------------------------------------------
InternalGetRegionInfoForCountry <- function(# Find (sub)region info for country vector
  ### Find (sub)region info for country vector
  iso.c,
  country.info, ##<< csv with region info
  countrycodes.csv ##<< csv with country codes info
 ,no.data = FALSE
 ,write.model.fun = "WriteModel"
  ){

    regions <- country.info
                                        # change JR, 20140404
    if (!all(grepl("[[:digit:]]", iso.c))) {
        if (all(grepl("[[:alpha:]]", iso.c))) {
            iso.c <- InternalGetCountryCodes(iso.c = iso.c, countrycodes.csv = countrycodes.csv)
        } else {
            stop("Elements of iso.c must be all alphabet letters or all digits.")
        }
    }

    C <- length(iso.c)

    dev.c <-  ldev.c <- namesubreg.c <- namereg.c <- ssa.c <- fp2020.c <- rep("", C)
    for (c in 1:C){
        namesubreg.c[c] <- paste(regions$Region[which.max(regions$ISO.code==iso.c[c])])
        namereg.c[c] <- paste(regions$Major.area[which.max(regions$ISO.code==iso.c[c])])
        ssa.c[c] <- paste(regions$Sub.Saharan..Africa[which.max(regions$ISO.code==iso.c[c])])
        dev.c[c] <- ifelse(paste(regions$Least.developed.country[which.max(regions$ISO.code==iso.c[c])])=="Yes","Poor",
                    ifelse(paste(regions$Developed..region[which.max(regions$ISO.code==iso.c[c])])=="Yes", "Rich", "Med"))
        fp2020.c[c] <- paste(regions$FP2020.country[which.max(regions$ISO.code==iso.c[c])])
    }

    ## for 'no.data' don't include the numeric codes for region and
    ## subregion. These are made separately to ensure they are the same for
    ## countries with and without data.
    if(no.data) {
        reg.country.info.c <- data.frame(as.character(iso.c),
                                         namereg.c,
                                         namesubreg.c,
                                         ssa.c, dev.c,
                                         fp2020.c,
                                         stringsAsFactors=FALSE)
        names(reg.country.info.c) <- c("iso.c", "namereg.c",
                                       "namesubreg.c", "ssa.c", "dev.c"
                                      ,"fp2020.c")
    } else {
        subreg.c <- as.numeric(as.factor(namesubreg.c))
        reg.c <- as.numeric(as.factor(namereg.c))
        reg.country.info.c <- data.frame(as.character(iso.c),
                                         reg.c, namereg.c,
                                         subreg.c, namesubreg.c,
                                         ssa.c, dev.c,
                                         fp2020.c,
                                         stringsAsFactors=FALSE)
        names(reg.country.info.c) <- c("iso.c", "reg.c", "namereg.c",
                                       "subreg.c", "namesubreg.c", "ssa.c", "dev.c"
                                      ,"fp2020.c")
    }

    ##
    ## Sexual activity classifications (do separately here so it's easy to see)
    ##
    if("Sexual.activity.among.unmarried" %in% colnames(regions)) {

        name.sex.ac.unm.c <- name.reg.in.sex.ac.unm.c <- name.subreg.in.sex.ac.unm.c <-
            name.reg.in.sex.ac.unm.SA1sub.c <- name.subreg.in.sex.ac.unm.SA1sub.c <- rep(NA, C)
                                        # 'name.reg.in.sex.ac.unm.c' is a new
                                        # variable: major region nested in
                                        # sexual activity category
        for(c in 1:C) {
            value.sex.ac.c <- regions$Sexual.activity.among.unmarried[which.max(regions$ISO.code==iso.c[c])]
            value.reg.c <- regions$Major.area[which.max(regions$ISO.code==iso.c[c])]
            value.subreg.c <- regions$Region[which.max(regions$ISO.code==iso.c[c])]

            name.sex.ac.unm.c[c] <- paste(value.sex.ac.c) # 0 or 1

            ## [MCW-2017-06-16-10] :: Make new columns in ~region.country.info.c~ to
            ## implement the new model for sexual activity categories:
            ## 1. If sexual activity group == 1, use subregions instead of major areas.
            ## 2. India is its own subregion and major area.

            value.reg.SA1sub.c <- value.reg.c
            value.reg.SA1sub.c[value.sex.ac.c == 1] <- value.subreg.c
            value.reg.SA1sub.c[iso.c[c] == "356"] <- #India
                regions$Country.or.area[regions$ISO.code == iso.c[c]]

            value.subreg.SA1sub.c <- value.subreg.c
            value.subreg.SA1sub.c[iso.c[c] == "356"] <- #India
                regions$Country.or.area[regions$ISO.code == iso.c[c]]

            ## Add SA labels.
            name.reg.in.sex.ac.unm.c[c] <-
                paste0(value.reg.c
                       ,ifelse(value.sex.ac.c == 1, " (SA high)", " (SA low)")
                      )
            name.subreg.in.sex.ac.unm.c[c] <-
                paste0(value.subreg.c
                       ,ifelse(value.sex.ac.c == 1, " (SA high)", " (SA low)")
                      )
            name.reg.in.sex.ac.unm.SA1sub.c[c] <-
                paste0(value.reg.SA1sub.c
                       ,ifelse(value.sex.ac.c == 1, " (SA high)", " (SA low)")
                      )
            name.subreg.in.sex.ac.unm.SA1sub.c[c] <-
                paste0(value.subreg.SA1sub.c
                       ,ifelse(value.sex.ac.c == 1, " (SA high)", " (SA low)")
                      )
        }
        reg.country.info.c$name.sex.ac.unm.c <-
            name.sex.ac.unm.c
        reg.country.info.c$name.reg.in.sex.ac.unm.c <-
            name.reg.in.sex.ac.unm.c
        reg.country.info.c$name.subreg.in.sex.ac.unm.c <-
            name.subreg.in.sex.ac.unm.c
        reg.country.info.c$name.reg.in.sex.ac.unm.SA1sub.c <-
            name.reg.in.sex.ac.unm.SA1sub.c
        reg.country.info.c$name.subreg.in.sex.ac.unm.SA1sub.c <-
            name.subreg.in.sex.ac.unm.SA1sub.c

        if(!no.data) {
            reg.country.info.c$sex.ac.unm.c <-
                as.numeric(as.factor(name.sex.ac.unm.c))
            reg.country.info.c$reg.in.sex.ac.unm.c <-
                    as.numeric(as.factor(reg.country.info.c$name.reg.in.sex.ac.unm.c))
            reg.country.info.c$subreg.in.sex.ac.unm.c <-
                    as.numeric(as.factor(reg.country.info.c$name.subreg.in.sex.ac.unm.c))
            reg.country.info.c$reg.in.sex.ac.unm.SA1sub.c <-
                    as.numeric(as.factor(reg.country.info.c$name.reg.in.sex.ac.unm.SA1sub.c))
            reg.country.info.c$subreg.in.sex.ac.unm.SA1sub.c <-
                    as.numeric(as.factor(reg.country.info.c$name.subreg.in.sex.ac.unm.SA1sub.c))
        }
    }
    ##value<< Data frame with (iso.c, reg.c, namereg.c, subreg.c, namesubreg.c, ssa.c, dev.c)
    ## where ssa.c and dev.c are factors,
    ## and reg.c and subreg.c are just integers
    ## part of Object \code{\link{country.info}}.
    return(reg.country.info.c)
}
###----------------------------------------------------------------------------------
InternalGetCountryCodes <- function(# Get ISO 3-digit country codes from character codes and vice versa.
  ### Get vector of 3-digit country codes from character codes and vice versa.
  iso.c,
  countrycodes.csv ##<< csv file with ISO 3-character and 3-digit country codes
) {
  # change JR, 20140404
  countrycodes <- read.csv(countrycodes.csv, header = T, stringsAsFactors = F)
  countrycodes$ISO.code <- as.character(countrycodes$ISO.code)
  iso.c <- as.character(iso.c)
  if (all(grepl("[[:digit:]]", iso.c))) {
    iso.c.output <- join(data.frame(ISO.code = iso.c), countrycodes)$Country.letter.code
  } else if (all(grepl("[[:alpha:]]", iso.c))) {
    iso.c.output <- join(data.frame(Country.letter.code = iso.c), countrycodes)$ISO.code
  } else {
    stop("Elements of iso.c must be all alphabet letters or all digits.")
  }
  if (any(is.na(iso.c.output)))
    warning(paste0("The country code(s) ", paste(iso.c[is.na(iso.c.output)], collapse = ", "),
                   " cannot be found in countrycodes.csv!"))
  return(iso.c.output)
}
###----------------------------------------------------------------------------------
## [MCW-2017-02-08-1] :: Created to harmonize the numbering of model clusters across countries with data and those without.
InternalFixNumericClusterCodes <- function(country.info, country.info.no.data)
{
    ## -------* Sub Functions

    MakeNumDf <- function(name.col, num.col) {
        all.clus <- levels(factor(c(country.info[[name.col]], country.info.no.data[[name.col]])))
        num.code.df <- data.frame(all.clus, 1:length(all.clus))
        colnames(num.code.df) <- c(name.col, num.col)
        return(num.code.df)
        }

    ## -------* MAIN PART

    ## -------** Data Frames w Numeric Codes

    regions.num.code.df <- MakeNumDf("namereg.c", "reg.c")
    subregions.num.code.df <- MakeNumDf("namesubreg.c", "subreg.c")
    sex.ac.num.code.df <- MakeNumDf("name.sex.ac.unm.c", "sex.ac.unm.c")
    reg.in.sex.ac.num.code.df <- MakeNumDf("name.reg.in.sex.ac.unm.c", "reg.in.sex.ac.unm.c")
    subreg.in.sex.ac.num.code.df <- MakeNumDf("name.subreg.in.sex.ac.unm.c", "subreg.in.sex.ac.unm.c")
    reg.in.sex.ac.SA1sub.num.code.df <-
        MakeNumDf("name.reg.in.sex.ac.unm.SA1sub.c", "reg.in.sex.ac.unm.SA1sub.c")
    subreg.in.sex.ac.SA1sub.num.code.df <-
        MakeNumDf("name.subreg.in.sex.ac.unm.SA1sub.c", "subreg.in.sex.ac.unm.SA1sub.c")

    ## -------** Fix Country Infos

    ## -------*** With Data

    ## Remember the order
    country.info$order <- 1:nrow(country.info)

    ## Merge
    country.info <-
        merge(country.info[,!colnames(country.info)=="reg.c"]
             ,regions.num.code.df, sort = FALSE
              )
    country.info <-
        merge(country.info[,!colnames(country.info)=="subreg.c"]
             ,subregions.num.code.df, sort = FALSE
              )
    country.info <-
        merge(country.info[,!colnames(country.info)=="sex.ac.unm.c"]
             ,sex.ac.num.code.df, sort = FALSE
              )
    country.info <-
        merge(country.info[,!colnames(country.info)=="reg.in.sex.ac.unm.c"]
             ,reg.in.sex.ac.num.code.df, sort = FALSE
              )
    country.info <-
        merge(country.info[,!colnames(country.info)=="subreg.in.sex.ac.unm.c"]
             ,subreg.in.sex.ac.num.code.df, sort = FALSE
              )
    country.info <-
        merge(country.info[,!colnames(country.info)=="reg.in.sex.ac.unm.SA1sub.c"]
             ,reg.in.sex.ac.SA1sub.num.code.df, sort = FALSE
              )
    country.info <-
        merge(country.info[,!colnames(country.info)=="subreg.in.sex.ac.unm.SA1sub.c"]
             ,subreg.in.sex.ac.SA1sub.num.code.df, sort = FALSE
              )

    ## Re-order
    country.info <-
        country.info[order(country.info$order)
                    ,!(colnames(country.info) == "order")
                     ]

    ## -------*** No Data

    ## Remember the order
    country.info.no.data$order <- 1:nrow(country.info.no.data)

    ## Merge
    country.info.no.data <-
        merge(country.info.no.data[,!colnames(country.info.no.data)=="reg.c"]
             ,regions.num.code.df, sort = FALSE
              )
    country.info.no.data <-
        merge(country.info.no.data[,!colnames(country.info.no.data)=="subreg.c"]
             ,subregions.num.code.df, sort = FALSE
              )
    country.info.no.data <-
        merge(country.info.no.data[,!colnames(country.info.no.data)=="sex.ac.unm.c"]
             ,sex.ac.num.code.df, sort = FALSE
              )
    country.info.no.data <-
        merge(country.info.no.data[,!colnames(country.info.no.data)=="reg.in.sex.ac.unm.c"]
             ,reg.in.sex.ac.num.code.df, sort = FALSE
              )
    country.info.no.data <-
        merge(country.info.no.data[,!colnames(country.info.no.data)=="subreg.in.sex.ac.unm.c"]
             ,subreg.in.sex.ac.num.code.df, sort = FALSE
              )
    country.info.no.data <-
        merge(country.info.no.data[,!colnames(country.info.no.data)=="reg.in.sex.ac.unm.SA1sub.c"]
             ,reg.in.sex.ac.SA1sub.num.code.df, sort = FALSE
              )
    country.info.no.data <-
        merge(country.info.no.data[,!colnames(country.info.no.data)=="subreg.in.sex.ac.unm.SA1sub.c"]
             ,subreg.in.sex.ac.SA1sub.num.code.df, sort = FALSE
              )

    ## Re-order
    country.info.no.data <-
        country.info.no.data[order(country.info.no.data$order)
                    ,!(colnames(country.info.no.data) == "order")
                     ]

    ## -------* Finish

    return(list(country.info = country.info
               ,country.info.no.data = country.info.no.data
                ))
}
###----------------------------------------------------------------------------------
InternalMakeCountryNamesShort <- function(# Shorten country names (and make consistent)
  ### Shorten country names (and make consistent)
  name.c){
    name.c = ifelse(name.c=="Cote d Ivoire", paste("Cote d'Ivoire"), paste(name.c))
    name.c = ifelse(name.c=="Sao Tome and Principe" |name.c=="Sao Tome & Principe", paste("Sao Tome Pr"), paste(name.c))
  name.c = ifelse(name.c=="Vietnam", paste("Viet Nam"), paste(name.c))
  name.c = ifelse(name.c=="Gambia The", paste("Gambia"), paste(name.c))
  name.c = ifelse(name.c=="Brunei Darussalam", paste("Brunei"), paste(name.c))
  name.c = ifelse(name.c=="Saint Kitts and Nevis", paste("Saint Kitts & Nevis"), paste(name.c))
  name.c = ifelse(name.c=="Timor Leste", paste("Timor-Leste"), paste(name.c))
  name.c = ifelse(name.c=="Dominican Rep.", paste("Dominican Republic"), paste(name.c))
  name.c = ifelse(name.c=="Syrian Arab Republic", paste("Syria"), paste(name.c))
  name.c = ifelse(name.c=="GuineaBissau", paste("Guinea-Bissau"), paste(name.c))
  name.c = ifelse(name.c=="Libyan Arab Jamahiriya", paste("Libya"), paste(name.c))
  name.c = ifelse(name.c=="Ukraine ", paste("Ukraine"), paste(name.c))
  name.c = ifelse(name.c=="Republic of Moldova"|name.c=="Moldova, Rep. of", paste("Moldova"), paste(name.c))
  name.c = ifelse(name.c=="Federated States of Micronesia"|name.c=="Micronesia (Federated States of )"|
  name.c=="Micronesia, Federated States of"|name.c=="Micronesia, Fed. States of"|
      name.c=="Micronesia (Fed. States of)", paste("Micronesia"), paste(name.c))
    name.c = ifelse(name.c=="United Kingdom", paste("U.K."), paste(name.c))
    name.c = ifelse(name.c=="United Kingdom of Great Britain and Northern Ireland", paste("U.K."), paste(name.c))
  name.c = ifelse(name.c=="United States of America"|name.c=="United States", paste("U.S."), paste(name.c))
  name.c = ifelse(name.c=="Congo, Dem. Rep."|name.c=="Democratic Republic of the Congo"|name.c=="Congo DR", paste("DRC"), paste(name.c))
  name.c = ifelse(name.c=="The former Yugoslav Republic of Macedonia"|name.c=="TFYR Macedonia", paste("Macedonia"), paste(name.c))
  name.c = ifelse(name.c=="Bosnia and Herzegovina"|name.c=="Bosnia & Herzegovina", paste("Bosn&Herze"), paste(name.c))
  name.c = ifelse(name.c=="Trinidad and Tobago"|name.c=="Trinidad & Tobago", paste("Trinidad&T"), paste(name.c))
  name.c = ifelse(name.c=="China, Hong Kong SAR", paste("Hong Kong"), paste(name.c))
  name.c = ifelse(name.c=="China, Macao Special Administrative Region", paste("Macao"), paste(name.c))
  name.c = ifelse(name.c=="China, Hong Kong Special Administrative Region", paste("Hong Kong"), paste(name.c))
  name.c = ifelse(name.c=="United Repulic of Tanzania",   paste("Tanzania"), paste(name.c))
  name.c = ifelse(name.c=="United States Virgin Islands",   paste("US Virgin Isl."), paste(name.c))
  name.c = ifelse(name.c=="United Arab Emirates",   paste("Arab Emirates"), paste(name.c))
  name.c = ifelse(name.c=="Lao People's Democratic Republic"|name.c=="Lao People's Dem. Rep."|
        name.c =="Lao PDR",   paste("Laos"), paste(name.c))
  name.c = ifelse(name.c=="Republic of Korea"|name.c == "Republic of Korea "|
        name.c == "Korea Rep"|name.c=="Korea, Rep. of", paste("South Korea"), paste(name.c))
  name.c = ifelse(name.c=="Democratic People's Republic of Korea"|name.c=="Korea DPR"|
        name.c=="Dem. People's Republic of Korea"|name.c=="Korea, Dem. People's Rep.", paste("North Korea"), paste(name.c))
  name.c = ifelse(name.c=="Central African Republic"|name.c=="Central African Rep.", paste("CAR"), paste(name.c))
  name.c = ifelse(name.c=="Iran (Islamic Republic of)"| name.c == "Iran, Islamic Republic of", paste("Iran"), paste(name.c))
  name.c = ifelse(name.c=="United Republic of Tanzania"| name.c=="Tanzania, United Republic of",   paste("Tanzania"), paste(name.c))
  name.c = ifelse(name.c=="Venezuela (Bolivarian Republic of)",   paste("Venezuela"), paste(name.c))
  name.c = ifelse(name.c=="Bolivia (Plurinational State of)",   paste("Bolivia"), paste(name.c))
  name.c = ifelse(name.c=="Antigua and Barbuda"|name.c=="Antigua & Barbuda", paste("Antigua and B."), paste(name.c))
  name.c = ifelse(name.c=="Northern Mariana Islands", paste("N. Mariana Isl."), paste(name.c))
  name.c = ifelse(name.c=="Occupied Palestinian Territory"|name.c=="OPT",
                  #paste("Occ. Palestinian Terr."),
                  paste("OPT"),
                  paste(name.c))
  name.c = ifelse(name.c=="Saint Vincent and the Grenadines" |
      name.c=="Saint Vincent & the Grenadines" |name.c=="Saint Vincent/Grenadines"|
      name.c == "St Vincent & the Grenadines", paste("St. Vincent & Gren."), paste(name.c))
  ##value<< Vector with length of \code{name.c}, but some names replaced
  return(name.c)
}

###----------------------------------------------------------------------------------
InternalWhichReg <- function(# Find region for vector of country iso codes
  ### Find region for vector of country iso codes
  iso.j, ##<< vector of country iso codes
  iso.c, ##<< vector with unique country iso codes
  reg.c##<< vector with region integers, corresponding to iso.c
  ){
  reg.j <- rep(NA, length(iso.j))
  for (iso in unique(iso.j)){
    reg <- reg.c[which.max(iso.c == iso)]
    reg.j[iso.j==iso] <- reg
  }
  ##value<< vector of length \code{iso.j} with region codes
  return(reg.j)
}

###----------------------------------------------------------------------------------
InternalInternalWhichSubreg <- function(# Find region for vector of country iso codes
  ### Find region for vector of country iso codes
  iso.j, ##<< vector of country iso codes
  iso.c, ##<< vector with unique country iso codes
  subreg.c##<< vector with sub-region integers, corresponding to iso.c
  ){
  subreg.j <- rep(NA, length(iso.j))
  for (iso in unique(iso.j)){
    subreg <- subreg.c[which.max(iso.c == iso)]
    subreg.j[iso.j==iso] <- subreg
  }
  ##value<< vector of length \code{iso.j} with subregion codes
  return(subreg.j)
}
###----------------------------------------------------------------------------------
GetObsCounts <- function(# Find proportion of countries with 0, 1, nres observations
  ### Find proportion of countries with 0, 1, nres observations
  data.j, ##<< observations, can include NAs
  iso.j, #<< ID for country
  select.j = NULL, ##<< subset of observations to select (NULL = all)
  selected.isos  = NULL,##<< set of iso codes to select (NULL = all)
  nres = 5 ##<< nres gives max obs, e.g. nres = 4 means 4+ is the last open-ended cat
  ## set to max(nres, 5) or min(4, nres)
  ){
  if (is.null(select.j)){
    select.j <- rep(T, length(data.j))
  }
  if (is.null(selected.isos)){
    selected.isos <- unique(iso.j)
  }
  # isos for countries with observations in the period
  isos.select <- iso.j[!is.na(data.j) & select.j==TRUE & is.element(iso.j, selected.isos)]
  if (nres >5) nres = 5
  if (nres <4) nres = 4
  if (length(isos.select)==0){
    res <- c(1, rep(0, nres))
    names(res) <- c("0", "1", "2", "3", "4", "5+")[1:(1+nres)]
    return(res)
  }
  # all countries, with or without observations in that particular period
  isos.all <- iso.j[ is.element(iso.j, selected.isos)]#select.j==TRUE &
  bla <- table(table(isos.select))
  if (nres==4){
    res <- c(length(unique(isos.all)) -  length(unique(isos.select)), # countries without data
             sum(bla["1"], na.rm =T), sum(bla["2"], na.rm =T), sum(bla["3"], na.rm =T),
             sum(bla[as.numeric(names(bla))>=4], na.rm =T)
             )
    names(res) <- c("0", "1", "2", "3", "4+")
  } else {
    res <- c(length(unique(isos.all)) -  length(unique(isos.select)),
             sum(bla["1"], na.rm =T), sum(bla["2"], na.rm =T), sum(bla["3"], na.rm =T),
             sum(bla["4"], na.rm =T),
             sum(bla[as.numeric(names(bla))>=5], na.rm =T)
             )
    names(res) <- c("0", "1", "2", "3", "4", "5+")
  }
  ##value<< vector with number of countries with 0,..,nres observations
  return(res/sum(res, na.rm = T))
}
###----------------------------------------------------------------------------------
InternalPlotPropsDataAvailability <- function(# Plot proportions of data availability
  ### Internal function for plotting proportions of data availability
  ymin, ##<< which height should stuff be plotted?
  res, ##<< proportions to be plotted
  col = 2, ##<< color to visualize proportion
  include.zero ##<< should first box with zero be included?
  ){
  yvalues <- ymin + res
  istart <- ifelse(include.zero,1,2)
  for (i in istart:length(res)){
    polygon(c(i-1,i-1, i,i,i-1),
            c(ymin,yvalues[i], yvalues[i], ymin, ymin),
            col = col, border = NA)
    # get box back
    polygon(c(i-1,i-1, i,i,i-1),
            c(ymin,ymin+1, ymin+1, ymin, ymin),
            bg = NA, border = 1)
    text(i-0.5,ymin+0.55, labels = round(100*res[i]), cex =1, col = 1)
  }
}

###----------------------------------------------------------------------------------
PlotDataAvailability <- function(# Create a plot to visualize data availability.
  ### Create a plot to visualize data availability.
  data, ##<< Object of \code{data}
  country.info, ##<< Object of \code{country.info}
  nres = 5, ##<< Determines the number of columns (nres+ is the last column, max is 5).
  summarize.unmet = FALSE, ##<< Logical: total prevalence is summarized when FALSE, unmet need when TRUE.
  years = c(1990, 2000, 2010),
  use.SDG.regions = FALSE
  ){

    if (!summarize.unmet){
        data.j <- data$props.tot.j
    } else {
        data.j <- data$props.unmet.j
    }
    iso.j  <- data$iso.j

    if(!is.null(years)) {
        years <- sort(unique(c(0, years, Inf)))
    } else stop("'years' must be non-NULL.")

    ## get regional info
    isos.Lr <- list()
    if(use.SDG.regions) {
        country.info <- country.info[country.info$iso.country %in% unique(iso.j),]
        isos.Lr[["All"]] = paste(country.info$iso.country)
        for(x in unique(country.info$groupname)) {
            isos.Lr[[x]] <- paste(country.info$iso.country[country.info$groupname == x])
        }
    } else {
        isos.Lr[["All"]] = paste(country.info$iso.c)
        isos.Lr[["Developed"]] = paste(country.info$iso.c[country.info$dev.c=="Rich"])
        isos.Lr[["Africa"]] = paste(country.info$iso.c[country.info$dev.c!="Rich"&
                                                       country.info$namereg.c=="Africa"])
        isos.Lr[["Asia"]] =paste(country.info$iso.c[country.info$dev.c!="Rich"&
                                                    country.info$namereg.c=="Asia"])
        isos.Lr[["LAC"]] =paste(country.info$iso.c[country.info$dev.c!="Rich"&
                                                   country.info$namereg.c=="Latin America and the Caribbean"])
        isos.Lr[["Oceania"]] =paste(country.info$iso.c[country.info$dev.c!="Rich"&
                                                       country.info$namereg.c=="Oceania"])
    }
    regs <- names(isos.Lr)
    nc.r <- list() # number of countries in each region
    for (reg in regs){
        nc.r[[reg]] <- length(unique(isos.Lr[[reg]]))
    }

  include.zero = TRUE
  #res <- GetObsCounts(data.j = data.j, iso.j = iso.j)
  # could be that break down by region+period introduces zeroes
  #if (res[1]>0 ) include.zero = TRUE

    if(use.SDG.regions) {
        par(mfrow = c(1,1), mar = c(5,25,1,1), cex.main = 1.5, cex.axis = 1.5, cex.lab = 1.5)
    } else {
        par(mfrow = c(1,1), mar = c(5,10,1,1), cex.main = 1.5, cex.axis = 1.5, cex.lab = 1.5)
    }
  plot(1, type = "n", ylim = c(0, -1 + length(regs) * (length(years) + 1)), bty = "n",
       xlim = c(as.numeric(!include.zero),nres+1), xaxt = "n", yaxt = "n",
       ylab = "", xlab = "Number of observations",
       main = "")
  axis(1, at = 0.5+seq(0,nres), labels = c(seq(0,nres-1), paste(nres, "+", sep = "")))
  # year is the cutoff moment (1990.0 is in second row)

  yearnow <- max(floor(data$years.j))+1 # note: 2011 should give 2012 as the end point
    labelsperiods <- character(length(years) - 1)
    labelsperiods[1] <- paste0("Before ", years[2])
    for(i in 2:(length(labelsperiods) - 1)) {
        labelsperiods[i] <- paste0(years[i], "-", years[i + 1] - 1)
    }
    labelsperiods[length(labelsperiods)] <- paste0(years[length(labelsperiods)], "-", yearnow)
  ymin <- ystart <- -1 + length(regs) * (length(years) + 1)
  namesregs <- paste(regs, " (", unlist(nc.r), ")", sep = "")
  for (reg in 1:length(regs)){
    axis(2, at = -1+ymin-seq(1,length(years)), labels = rep("",length(years)), las = 1, tick=TRUE)
    axis(2, at = -1.5+ymin-seq(1,length(years)-1), labels = labelsperiods, las = 1, tick=FALSE)
    #  axis(2, at = -1+ymin-seq(1,5), labels = c("", years[-1]), las = 1)
    axis(2, at = -1+ymin-0.5, labels = paste(namesregs[reg], "", sep = ""), las = 1)
    ymin <- ymin-2
    region <- regs[reg]
    iso.select <- isos.Lr[[region]]
    res <- GetObsCounts(data.j = data.j, iso.j = iso.j,
                        selected.isos = iso.select)
    InternalPlotPropsDataAvailability(col = 3,ymin = ymin, res = res, include.zero = include.zero)
    for (t in 2:length(years)){
      ymin <- ymin-1
      res <- GetObsCounts(data.j = data.j, iso.j = iso.j,
                          select.j = (data$years.j>=years[t-1] & data$years.j<years[t]),
                          selected.isos = iso.select)
      InternalPlotPropsDataAvailability(col = "lightblue",ymin = ymin, res = res, include.zero = include.zero)
    }
  }# end plot

  if (include.zero) segments(1, -1, 1, ystart-1, lwd = 5)
  ##value<< NULL. Plot appears in R-console.
  return(invisible())
}


## [MCW-2016-03-11-1]: CREATED THIS FUNCTION. Based on InternalFixRange()
InternalFixEndYear <- function(# Fix range character vector that got converted to dates in Excel
  ### Replaces month names with numbers
  x
  ) {
  x <- as.character(x)
  from <- c("Jun-05", "Jul-05")
  to <- c(2006, 2007)
  sub.table <- data.frame(from = from, to = to, stringsAsFactors = FALSE)
  for (i in 1:nrow(sub.table))
    x <- gsub(sub.table$from[i], sub.table$to[i], x)
    return(as.numeric(x))
    return(x)
}
###----------------------------------------------------------------------------------
##' @importFrom magrittr %>%
extractDenominators <- function(denominator_csv, in_union) {
  data <- read.csv(
    file = denominator_csv,
    header = TRUE,
    stringsAsFactors = FALSE,
    na.strings = c("", "NA")
  )

  verifyDenominators(data, in_union = in_union)

  if (!any(names(data) == "In.union")) {
    return(data)
  }

  data <-data %>% dplyr::filter(In.union == in_union) %>% dplyr::select(-In.union)

  # Remove empty columns
  data[, sapply(data, function(i) {
    !all(is.na(i))
  })]
}
###----------------------------------------------------------------------------------
verifyDenominators <- function(x, in_union) {
    if(is.data.frame(x)) {
        temp_denom <- x
        fname <- "denominator counts data frame"
    } else if(is.character(x) && file.exists(x)) {
        temp_denom <- extractDenominators(denominator_csv = x, in_union = in_union)
        return(TRUE)
    } else {
        stop("'x' must be a data frame or a valid file path.")
    }

    ## Check for required columns
    if(!all(c("ISO.code", "Country") %in% colnames(temp_denom)))
        stop("'", fname, "' must have columns 'ISO.code' and 'Country'.")
    if(!("In.union" %in% colnames(temp_denom))) {
        temp_denom_ccount <- table(temp_denom$`ISO.code`)
        ccount_gt_1 <- names(temp_denom_ccount[temp_denom_ccount > 1])
        if(length(ccount_gt_1) > 0) {
            stop("Countries with ISO codes ", paste(ccount_gt_1, collapse = ", "), " occur in '",
                 fname,
                 "' more than once. \n\tIf the file is for married and unmarried, the column identifying marital status must be called 'In union' (case sensitive).")
        }
    } else {
        ## Check 'in union' has correct entries
        if(!(all(temp_denom$`In.union` %in% c(0, 1))))
            stop("'In.union' in '",
                 fname,
                 "' must only contain '0' or '1'.")
        ccount_by_iu <- as.data.frame(table(temp_denom$`In.union`, temp_denom$`ISO.code`),
                                      stringsAsFactors = FALSE)
        ccount_gt_1 <- ccount_by_iu$Var2[ccount_by_iu$Freq > 1]
        if(length(ccount_gt_1 > 0))
            stop("Countries with ISO codes ",
                 paste(ccount_gt_1, collapse = ", "),
                 "occur more than twice in '",
                 fname, "'.")
    }

    ## Check that counts are all numeric
    year_cols <-
        colnames(temp_denom)[!(colnames(temp_denom) %in% c("Country.letter.code", "ISO.code", "Country", "In.union"))]
    bad_cols <- character(0)
    for(nm in year_cols) {
        if(any(is.character(temp_denom[,nm]))) bad_cols <- c(bad_cols, nm)
    }
    if(length(bad_cols > 0))
        stop("'", fname, "' has non-numeric entries in column(s) '",
             paste(bad_cols, collapse = ", "), "'. Note that the country names should be in a column called 'Country', country letter codes in a column called 'Country letter code', the ISO codes in a column called 'ISO.code' and the marital group identifier in a column called 'In.union'.")
    return(TRUE)
}










###----------------------------------------------------------------------------------
### The End.
FPcounts/FPEMglobal documentation built on July 20, 2024, 2:35 a.m.