R/sum_acs.R

Defines functions sumacs

Documented in sumacs

#' @title Estimate proportions, ratios, aggregations and the respective margins
#'   of error (MOEs).
#' @description The \code{sumacs} function uses outputs from the
#'   \code{\link{acs}} package to compute proportions, ratios and aggregations
#'   based on text formulas, or simply extract variables. The function
#'   downloads the data and then estimate the formulas. If the function is used
#'   without specifying any \code{data}, remember to define a key using the
#'   \code{\link{acs}} command \code{api.key.install(key="*")}.
#' @param formula A character or vector of characters containing formulas using
#'   ACS/Census variables or just variables. + - operators can be included. / defines a
#'   division. When the formula contains "* 100", the final statistic and MOE is
#'   multiply by 100.
#' @param varname A character or vector of characters containing the new
#'   variables to be created. This vector must have same length as
#'   \code{formula} and \code{method}, and it is not needed when getting only
#'   variables.
#' @param method A character or vector of characters defining the type of
#'   estimate expected: "proportion", "ratio", "aggregation", "variable". This
#'   vector must have same length as \code{formula} and \code{varname}. It is
#'   not needed when getting only variables. Default value "variable".
#' @param level A character or vector of characters specifying the geographic
#'   level of the data. It may be necessary to specificy values to the
#'   corresponding levels. For instance, when \code{level = "county"}, you have
#'   to specify a state (e.g., \code{state = "WI"}, the default state in this
#'   package). You can also use a wildcard method (\code{state = "*"}) to
#'   include all the states. Below, you can see the required combinations of
#'   different summary levels.
#'
#'   \cr 010 us \cr 020 region \cr 030 division \cr 040 state \cr 050 state,
#'   county \cr 060 state, county, county.subdivision \cr 140 state, county,
#'   tract \cr 150 state, county, tract, block.group \cr 160 state, place \cr
#'   250 american.indian.area \cr 320 state, msa \cr 340 state, csa \cr 350
#'   necta \cr 400 urban.area \cr 500 state, congressional.district \cr 610
#'   state, state.legislative.district.upper \cr 620 state,
#'   state.legislative.district.lower \cr 795 state, puma \cr 860 zip.code \cr
#'   950 state, school.district.elementary \cr 960 state,
#'   school.district.secondary \cr 970 state, school.district.unified \cr
#'
#' When \code{combine == TRUE}, the geographic information should be in a list.
#' @param combine Whether the geographies are to be combined. If \code{combine = TRUE},
#' lists should be used when specifying geographic levels (the corresponding level
#' for the level specified). If the rest geographic levels has one element,
#' the function will assume that level is equal for all the sub-levels.
#' For example, if \code{state = "WI"}, and several counties were specified,
#' the function assumes that all the counties are from WI.
#' @param combine.name Label for the aggregate geography when combining levels.
#' The default value is \code{aggregate}.
#' @param dataset A string or vector of strings specifying the data set to be used: acs, sf1 or sf1.
#' The default value is "acs".
#' @param endyear An integer or vector of integers (default is 2014) indicating the latest year of
#'   the data in the survey or Census year.
#' @param span An integer indicating the span (in years) of the desired ACS data
#'   (should be 1, 3, or 5), defaults to 5.
#' @param conf.level Confidence level to estimate MOEs. The default value is
#'   0.90.
#' @param one.zero Whether to include standard errors for only one zero-value
#'   (max value) of columns or all. The default is TRUE.
#' @param trace Shows progress of the variable creation. The default is TRUE.
#' @param data Input data generated by the \code{\link{acsdata}} function.
#'   Variables and levels must be the same as those specified in the
#'   \code{sumacs} function.
#' @param format.out Format of the output: "wide" or "long". The default is
#'   "wide".
#' @param file The resulting output is exported to a CSV file rather than to the R prompt. The file name must be specified as a character string.
#' @param print.levels Boolean that print levels generated by the \code{geo.make} function.
#' @return Returns a \code{data.table/data.frame} object with the estimates and
#'   MOEs.
#' @details When the standard error of a proportion cannot be estimated, the
#'   "ratio" option is used. This adjustment is done row by row.
#' @note Depending on the quality of the internet connection, number of
#'   variables and levels, getting the ACS/Census data can be slow, especially for the
#'   levels "county.subdivision", "block.group", and "tract" (it might take more than 30 minutes).  It is recommended to get the data using the function
#'   \code{\link{acsdata}} first, and then to use \code{sumacs}.
#' @examples
#' # api.key.install(key="*")
#'
#' # without combining
#' sumacs(formula = "(b16004_004 + b16004_026 + b16004_048 / b16004_001)",
#'  varname = "langspan0913", method = "prop")
#'
#' # combining
#' sumacs("(b16004_004 + b16004_026 + b16004_048 / b16004_001)",
#'  varname = "test",
#'  method = "prop",
#'  level = c("block.group"),
#'  state = list("WI"),
#'  county = list(1, 141),
#'  tract = list(950100, 11700),
#'  block.group = list(1:2, 1:2),
#'  combine = TRUE)
sumacs  <- function(formula, varname = NULL, method = "variable",
                    level = "state",
                    dataset = "acs", endyear = 2014, span = 5, conf.level = 0.90,
                    one.zero = TRUE, trace = TRUE, data = NULL,
                    format.out = "wide", file = NULL, print.levels = TRUE,
                    us = "*",
                    region = "*",
                    division = "*",
                    state = "WI",
                    county = "*",
                    county.subdivision ="*",
                    place ="*",
                    tract = "*",
                    block.group = "*",
                    msa = "*",
                    csa = "*",
                    necta = "*",
                    urban.area = "*",
                    congressional.district = "*",
                    state.legislative.district.upper = "*",
                    state.legislative.district.lower = "*",
                    puma = "*",
                    zip.code = "*",
                    american.indian.area = "*",
                    school.district.elementary = "*",
                    school.district.secondary = "*",
                    school.district.unified = "*",
                    combine = FALSE,
                    combine.name = "aggregate")  {

    # define new name file to export csv
    newfile <- NULL
    if (!is.null(file)) {
      newfile <- file
      file <-  NULL
    }

    if (length(varname) > 1 & any(duplicated(varname))) {
      stop("Duplicated varnames, please define unique varnames!")

    }
    # initial tests
    if (length(formula) > 1 & length(method) == 1) {
      method <- rep(method, length(formula))
    }

    if (length(varname)==0 & all(method=='variable')) {
      varname <- getvars(formula)
    }

    if (length(varname)==0 & all(method!='variable')) {
      varname <- paste0("var", 1:length(formula))
      print("Specify varname to get custom variable names!")
    }

    # create dataset combining features
    if (length(dataset) == 1) {  dataset <- rep(dataset, length(formula)) }
    if (length(endyear) == 1) {  endyear <- rep(endyear, length(formula)) }

    if (!all.equal(length(dataset), length(endyear), length(formula))) {
      stop("Formula, dataset, endyear and varname should have the same length!")
    }
    else {
      vdata <- data.table(formula, dataset, endyear, varname, method)
      # create combinations
      comb <- vdata[, .(dataset, endyear)]
      comb <- comb[!duplicated(comb)]

      # define list to save results
      output <- list()
      # loop through object comb
      for (i in 1:nrow(comb)) {
        temp <- vdata[dataset == comb[i, dataset] & endyear == comb[i, endyear]]
        print(paste0("Extracting data from: ", comb[i, dataset], " ",
                     comb[i, endyear]))
        output[[i]] <- compute.acs(formula = temp$formula,
                                   level = level,
                                   dataset = unique(temp$dataset),
                                   endyear = unique(temp$endyear),
                                   method = temp$method,
                                   varname = temp$varname,
                                   span = span,
                                   conf.level = conf.level,
                                   one.zero = one.zero,
                                   trace = trace,
                                   data = data,
                                   format.out = format.out,
                                   file = file,
                                   us = us,
                                   region = region,
                                   division = division,
                                   state = state,
                                   county = county,
                                   county.subdivision = county.subdivision,
                                   place = place,
                                   tract = tract,
                                   block.group = block.group,
                                   msa = msa,
                                   csa = csa,
                                   necta = necta,
                                   urban.area = urban.area,
                                   congressional.district = congressional.district,
                                   state.legislative.district.upper = state.legislative.district.upper,
                                   state.legislative.district.lower = state.legislative.district.lower,
                                   puma = puma,
                                   zip.code = zip.code,
                                   american.indian.area = american.indian.area,
                                   school.district.elementary = school.district.elementary,
                                   school.district.secondary = school.district.secondary,
                                   school.district.unified = school.district.unified,
                                   combine = combine,
                                   combine.name = combine.name,
                                   print.levels = print.levels
                      )
      }
      # merge files
      if (format.out == "wide") {
        vars <- grep("sumlevel|geoid|fips", names(output[[1]]), value = TRUE)
        mydata <- Reduce(function(x, y) merge(x, y, all = TRUE, by=c(vars)), output)
      }
      else if (format.out == "long") {
        mydata <- rbindlist(output, fill = TRUE)
      }

      if (is.null(newfile)) {
        return(mydata)
      }
      else {
        fwrite(mydata, file = newfile, na='NA')
        print("Data exported to a CSV file! ")
         }
    }
} # end function

# replace missing values function
missing_thres <- -99999999
rmiss <- function(mat, miss_value = missing_thres) {
  mat[mat < miss_value] <- NA
  return(mat)
}

# individual extraction function
compute.acs <- function(formula, varname = NULL, method = NULL,  level = "state",
                    dataset = "acs", endyear = 2014, span = 5, conf.level = 0.90,
                    one.zero = TRUE, trace = TRUE, data = NULL,
                    format.out = "wide", file = NULL, print.levels = TRUE,
                    us = "*",
                    region = "*",
                    division = "*",
                    state = "WI",
                    county = "*",
                    county.subdivision ="*",
                    place ="*",
                    tract = "*",
                    block.group = "*",
                    msa = "*",
                    csa = "*",
                    necta = "*",
                    urban.area = "*",
                    congressional.district = "*",
                    state.legislative.district.upper = "*",
                    state.legislative.district.lower = "*",
                    puma = "*",
                    zip.code = "*",
                    american.indian.area = "*",
                    school.district.elementary = "*",
                    school.district.secondary = "*",
                    school.district.unified = "*",
                    combine = FALSE,
                    combine.name = "aggregate")  {


  ##################
  # initial checks
  ###################

 # if (!all(grepl("\\/|\\+|\\-", formula))) {
 #    # stop("Some formulas do not have any operator (+, - or /)")
 #    method <- rep("variables", length(varname))
 #  }

 if (any(!grepl("\\/", formula[tolower(method) %in% c("proportion", "prop", "ratio")])) )

 {
  stop("Some proportion or ratio formulas do not have the / operator!")
 }

  if (any(grepl("\\/", formula[tolower(method) %in% c("aggregation", "agg")]))) {
  stop("Some aggregation formulas do have the / operator!")
 }

  ###############################
  # definition of some variables
  ###############################
  if (length(varname) > 1 & any(duplicated(varname))) {
    stop("Duplicated varnames, please define unique varnames!")

  }

  if (length(formula) > 1 & length(method) == 1) {
    method <- rep(method, length(formula))
  }

  if (length(varname)==0 & all(method=='variable')) {
    varname <- getvars(formula)
  }

  if (length(varname)==0 & all(method!='variable')) {
    varname <- paste0("var", 1:length(formula))
    print("Specify varname to get custom variable names!")
  }

   if ((identical(length(formula), length(method)) == 0)) {
    stop("Vector of formulas and methods must have the same length!")
  }

  conf.level <- round(qnorm( (1 + conf.level) / 2), digits = 3)
  variables <- acsr::getvars(formula)

  nvars <- length(variables)
  nlevels <- length(level)

  newvars <- length(varname)

  print (paste0(". . . . . .  ACS/Census variables : ", nvars))
  print (paste0(". . . . . .  Levels : ", nlevels))
  print (paste0(". . . . . .  New variables : ", newvars))

# where I will save results
output <- list()

  #########################################
  # get data for all data (time consuming)
  #########################################

  if ( is.null(data) ) {

    ldata <- acsr::acsdata(formula = formula, level = level, dataset = dataset,
              endyear = endyear, span = span,
              us = us,
              region = region,
              division = division,
              state = state,
              county = county,
              county.subdivision = county.subdivision,
              place = place,
              tract = tract,
              block.group = block.group,
              msa = msa,
              csa = csa,
              necta = necta,
              urban.area = urban.area,
              congressional.district = congressional.district,
              state.legislative.district.upper = state.legislative.district.upper,
              state.legislative.district.lower = state.legislative.district.lower,
              puma = puma,
              zip.code = zip.code,
              american.indian.area = american.indian.area,
              school.district.elementary = school.district.elementary,
              school.district.secondary = school.district.secondary,
              school.district.unified = school.district.unified,
              combine = combine,
              combine.name = combine.name,
              print.levels = print.levels
              )

  }

  else if ( !is.null(data) ) {

    if ( !is.list(data) ) {
      stop("The data must be a list!")
    }
    else if ( is.list(data) & !( all(sapply(data, class) ==  "acs") ) )
    {
      stop("The data must contain acs objects!")
    }
    else if ( !(all(level %in% names(data)))) {
      stop("Not all levels coincide with the data")
    }
    else if ( !(all(variables %in% data[[1]]@acs.colnames)) ) {
      stop("Not all the ACS/Census variables were found, check variable names in your formulas and the dataset you are using!")
    }

  ldata <- data

  } # else if for data

  ################################
  # loop variable and level
  ################################

  print(". . . . . .  Creating variables")

  vdata <- list()

  # lower case methods
  method <- tolower(method)
  # start loop by variable
  for (v in seq_along(varname) ) {

    # create formulas from text
    constr <- gsub("\\(|\\)", "", formula[v] ) # remove parentheses
    constr <- gsub("\\* 100", "", constr) # remove * 100
    multiply <- grepl("\\* 100", formula[v]) # index multiplication by 100

    # todo: to check in the future version
    division <- grepl("\\/", constr)

    #####################
    # condition formulas
    #####################

    if (method[v] %in% c("proportion", "prop", "ratio") & division == TRUE) {

      # division (proportion or ratio)
      form <- strsplit(constr, "/") # split to get numerator and denominator

      if ( length(form[[1]]) == 1 | length(form[[1]]) > 2 ) {
        stop(paste0("There is no or multiple division operators: ", varname[v]))
      }

      # numerator
      # extract operators
      noper <- strsplit(form[[1]][1], "[aA-zZ]*[0-9]*")
      noper <- grep("\\+|\\-", noper[[1]], value = TRUE)

      # extract variables
      nume  <- strsplit(form[[1]][1], "[\\+]|[\\-]")
      nume <- gsub("[[:space:]]", "", nume[[1]])
      nume <- toupper(nume)

      # denominator
      # extract operators
      doper <- strsplit(form[[1]][2], "[aA-zZ]*[0-9]*")
      doper <- grep("\\+|\\-", doper[[1]], value=TRUE)

      # extract variables
      deno  <- strsplit(form[[1]][2], "[\\+]|[\\-]")
      deno <- gsub("[[:space:]]", "", deno[[1]])
      deno <- toupper(deno)

    }

    if (method[v] %in% c("aggregation", "agg") & division == FALSE) {

      # EXTRACT OPERATORS
      oper <- strsplit(constr, "[aA-zZ]*[0-9]*")
      oper <- grep("\\+|\\-", oper[[1]], value=TRUE)

      # EXTRACT VARIABLES
      variable  <- strsplit(constr, "[\\+]|[\\-]")
      variable <-  gsub("^\\s+|\\s+$", "", variable[[1]])
      variable <-  toupper(variable)

    }

    ###################################
    # create and compute formulas
    ###################################

    # nested level loop

    if ( combine == FALSE ) { ss <- seq_along(level) } else { ss <- 1 } # only one level

    for (l in  ss ) {

      if (combine == FALSE ) {
      dat <- ldata[[level[l]]]
      geo <- dat@geography
      }

      if ( combine == TRUE ) {
        dat <- ldata
      }

      ############################
      # only variables
      ############################

      if (method[v] == "variable")
      {
       p <- as.vector(dat@estimate[, v])
       new_error <- as.vector(dat@standard.error[, v])
       p <- ifelse(p < missing_thres, NA, p)
       new_error <- ifelse(new_error < missing_thres, NA, new_error)
      }

      ######################
      # proportion or ratios
      ######################

      if (method[v] %in% c("proportion", "prop", "ratio") & division == TRUE) {

        wn <- which(dat@acs.colnames %in% nume)
        wd <- which(dat@acs.colnames %in% deno)

        nt <- vector()
        for (i in 1:length(nume)) {
          if ( i == max(length(nume))) {
            x <- paste0("dat[,", wn[i], "]")
          }
          else {
            x <- paste0("dat[,", wn[i], "]", noper[i])
          }
          nt <- paste0(nt, x)
        }

        dt <- vector()
        for (i in 1:length(deno)) {
          if ( i == max(length(deno))) {
            x <- paste0("dat[,", wd[i], "]")
          }
          else {
            x <- paste0("dat[,", wd[i], "]", doper[i])
          }
          dt <- paste0(dt, x)
        }

        # estimates and errors
        est <- rmiss(acs::estimate(dat))
        err <- rmiss(acs::standard.error(dat))

        num <- rmiss(acs::estimate(eval(parse(text=nt))))
        den <- rmiss(acs::estimate(eval(parse(text=dt))))

        p <- num / den

        if ( method[v] %in% c("prop", "proportion") & any( na.omit(as.vector(p)) > 1))  {

          stop("The proportion is higher than 1, to use method ratio?")

        }

        # p[den == 0] <- NA

        # definition of error

        if ( length(p) == 1 ) {

          if ( one.zero == TRUE ) {

            if (length(wn) > 1) {
              err_zero_num <- max(err[, wn] ^ 2 * (est[, wn] == 0))
              err_num <- sum(err[, wn] ^ 2 * (est[, wn] != 0))
            }
            else {
              err_zero_num <- err[, wn] ^ 2 * (est[, wn] == 0)
              err_num <- err[, wn] ^ 2 * (est[, wn] != 0)
            }

            if (length(wd) > 1 ) {
              err_zero_den <- max(err[, wd] ^ 2 * (est[, wd] == 0))
              err_den <- sum(err[, wd] ^ 2 * (est[, wd] != 0))
            }
            else {
              err_zero_den <- err[, wd] ^ 2 * (est[, wd] == 0)
              err_den <- err[, wd] ^ 2 * (est[, wd] != 0)
            }

            err_num <- err_zero_num + err_num
            err_den <- err_zero_den + err_den

          }

          else if ( one.zero == FALSE ) {

            if (length(wn) > 1) {
              err_num <- sum(err[, wn] ^ 2)
            }

            else {
              err_num <- err[, wn] ^ 2
            }

            if (length(wd) > 1 ) {
              err_den <- sum(err[, wd] ^ 2)
            }
            else {
              err_den <- err[, wd] ^ 2
            }

          }

        }

        else if ( length(p) > 1 ) {

          if ( one.zero == TRUE ) {

            if (length(wn) > 1) {
              err_zero_num <- apply(err[, wn] ^ 2 * (est[, wn] == 0), 1, max)
              err_num <- apply(err[, wn] ^ 2 * (est[, wn] != 0), 1, sum)
            }
            else {
              err_zero_num <- err[, wn] ^ 2 * (est[, wn] == 0)
              err_num <- err[, wn] ^ 2 * (est[, wn] != 0)
            }

            if (length(wd) > 1 ) {
              err_zero_den <- apply(err[, wd] ^ 2 * (est[, wd] == 0), 1, max)
              err_den <- apply(err[, wd] ^ 2 * (est[, wd] != 0), 1, sum)
            }
            else {
              err_zero_den <- err[, wd] ^ 2 * (est[, wd] == 0)
              err_den <- err[, wd] ^ 2 * (est[, wd] != 0)
            }

            err_num <- err_zero_num + err_num
            err_den <- err_zero_den + err_den

          }

          else if ( one.zero == FALSE ) {

            if (length(wn) > 1) {
              err_num <- apply(err[, wn] ^ 2 , 1, sum)
            }

            else {
              err_num <- err[, wn] ^ 2
            }

            if (length(wd) > 1 ) {
              err_den <- apply(err[, wd] ^ 2, 1, sum)
            }
            else {
              err_den <- err[, wd] ^ 2
            }

          }
        }

        # computing standard errors for proportions and ratios (checked!)

        if ( method[v] %in% c("proportion", "prop") ) {

          suppressWarnings(
            new_error <- ifelse((err_num - ( p ^ 2 * err_den) ) < 0 | is.na( err_num - (p ^ 2 * err_den)),
                                sqrt(err_num + (p ^ 2 * err_den)) / den,
                                sqrt(err_num - (p ^ 2 * err_den)) / den)
          )

        }

        if (method[v] %in% c("ratio")) {
          suppressWarnings(
            new_error <-  sqrt(err_num + (p ^ 2 * err_den)) / den
          )
        }



      } # end proportion

      #################
      # aggregation
      ##################

      if (method[v] %in% c("aggregation", "agg") & division == FALSE) {

        wa <- which(dat@acs.colnames %in% variable)

        ft <- vector()
        for (i in 1:length(variable)) {

          if ( i == max(length(variable))) {
            x <- paste0("dat[,", wa[i], "]")
          }
          else {
            x <- paste0("dat[,", wa[i], "]", oper[i])
          }

          ft <- paste0(ft, x)

        }

        # estimates and errors

        est <- rmiss(acs::estimate(dat))
        err <- rmiss(acs::standard.error(dat))
        p  <-  rmiss(acs::estimate(eval(parse(text = ft))))

        # one zero  computation

        if ( length(p) == 1 ) {

          if ( one.zero == TRUE ) {

            if (length(wa) > 1) {
              err_zero_agg <- max(err[, wa] ^ 2 * (est[, wa] == 0))
              err_agg <- sum(err[, wa] ^ 2 * (est[, wa] != 0))
            }
            else {
              err_zero_agg <- err[, wa] ^ 2 * (est[, wa] == 0)
              err_agg <- err[, wa] ^ 2 * (est[, wa] != 0)
            }

            err_agg <- err_zero_agg + err_agg

          }

          else if ( one.zero == FALSE ) {

            if (length(wa) > 1) {
              err_agg <- sum(err[, wa] ^ 2)
            }
            else {
              err_agg <- err[, wa] ^ 2
            }

          }

        }

        else if ( length(p) > 1 ) {

          if ( one.zero == TRUE ) {

            if (length(wa) > 1) {
              err_zero_agg <- apply(err[, wa] ^ 2 * (est[, wa] == 0), 1, max)
              err_agg <- apply(err[, wa] ^ 2 * (est[, wa] != 0), 1, sum)
            }
            else {
              err_zero_agg <- err[, wa] ^ 2 * (est[, wa] == 0)
              err_agg <- err[, wa] ^ 2 * (est[, wa] != 0)
            }

            err_agg <- err_zero_agg + err_agg

          }

          else if ( one.zero == FALSE ) {

            if (length(wa) > 1) {
              err_agg <- apply(err[, wa] ^ 2 , 1, sum)
            }
            else {
              err_agg <- err[, wa] ^ 2
            }

          }
        }

        # computing standard errors for aggregation
        new_error <- sqrt(err_agg)

      } # end aggregation

      #######################
      # create datasets (one by one!)
      #######################

 if ( combine == FALSE ) {

 if (level[l] == "us") {

          if (multiply == TRUE)
            { cmoe <-  as.vector(new_error * conf.level * 100)
              if (method[v] %in% c("prop", "proportion"))
                { cmoe[cmoe > 100] <- 100 }
            }
         else { cmoe <-  as.vector(new_error * conf.level)
              if (method[v] %in% c("prop", "proportion"))
                { cmoe[cmoe > 1] <- 1 }
            }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = as.character(NA),
          sumlevel = "010",
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

 if (level[l] == "region") {

        if (multiply == TRUE)
          { cmoe <-  as.vector(new_error * conf.level * 100)
            if (method[v] %in% c("prop", "proportion"))
              { cmoe[cmoe > 100] <- 100 }
          }
       else { cmoe <-  as.vector(new_error * conf.level)
            if (method[v] %in% c("prop", "proportion"))
              { cmoe[cmoe > 1] <- 1 }
          }

        vdata[[ paste0(v,l) ]] <- data.table(
          geoid = as.character(NA),
          sumlevel = "020",
          region = geo$region,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

 if (level[l] == "division") {

          if (multiply == TRUE)
            { cmoe <-  as.vector(new_error * conf.level * 100)
              if (method[v] %in% c("prop", "proportion"))
                { cmoe[cmoe > 100] <- 100 }
            }
         else { cmoe <-  as.vector(new_error * conf.level)
              if (method[v] %in% c("prop", "proportion"))
                { cmoe[cmoe > 1] <- 1 }
            }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = as.character(NA),
          sumlevel = "030",
          division = geo$division,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "state") {

        if (multiply == TRUE)
          { cmoe <-  as.vector(new_error * conf.level * 100)
            if (method[v] %in% c("prop", "proportion"))
              { cmoe[cmoe > 100] <- 100 }
          }
       else { cmoe <-  as.vector(new_error * conf.level)
            if (method[v] %in% c("prop", "proportion"))
              { cmoe[cmoe > 1] <- 1 }
          }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = sprintf("%02d", as.numeric(geo$state)),
          sumlevel = "040",
          st_fips = geo$state,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "county") {

          if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%03d", as.numeric(geo$county))),
          sumlevel = "050",
          st_fips = geo$state,
          cnty_fips = geo$county,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "county.subdivision") {

          if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%03d", as.numeric(geo$county)), sprintf("%05d", as.numeric(geo$countysubdivision))),
          sumlevel = "060",
          st_fips = geo$state,
          cnty_fips = geo$county,
          cnty_sub_fips = geo$countysubdivision,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }


      if (level[l] == "tract") {

           if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%03d", as.numeric(geo$county)), sprintf("%06d", as.numeric(geo$tract))),
          sumlevel = "140",
          st_fips = geo$state,
          cnty_fips = geo$county,
          tract_fips = geo$tract,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "block.group") {

           if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(geo$state, sprintf("%03d", as.numeric(geo$county)), sprintf("%06d", as.numeric(geo$tract)), as.numeric(geo$blockgroup)),
          sumlevel = "150",
          st_fips = geo$state,
          cnty_fips = geo$county,
          tract_fips =  geo$tract,
          block_group = geo$blockgroup,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "place") {

           if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%05d", as.numeric(geo$place))),
          sumlevel = "160",
          st_fips = geo$state,
          place = geo$place,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "american.indian.area") {

          if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = as.character(NA),
          sumlevel = "250",
          indian_area = geo$americanindianarea,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

    if (level[l] == "puma") {

          if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state))),
          sumlevel = "795",
          st_fips = geo$state,
          puma = geo$publicusemicrodataarea,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }


        if (level[l] == "msa") {

          if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state))),
          sumlevel = "320",
          st_fips = geo$stat,
          msa = geo$metropolitanstatisticalareamicropolitanstatisticalarea,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

        if (level[l] == "csa") {

          if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state))),
          sumlevel = "340",
          st_fips = geo$state,
          csa = geo$combinedstatisticalarea,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

        if (level[l] == "necta") {

          if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = as.character(NA),
          sumlevel = "350",
          necta = geo$newenglandcityandtownarea,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

        if (level[l] == "urban.area") {

          if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = as.character(NA),
          sumlevel = "400",
          urban_area = geo$urbanarea,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }


      if (level[l] == "congressional.district") {

            if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%02d", as.numeric(geo$congressionaldistrict))),
          sumlevel = "500",
          st_fips = geo$state,
          cong_dist = geo$congressionaldistrict,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }


      if (level[l] == "state.legislative.district.upper") {

           if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%03d", as.numeric(geo$statelegislativedistrictupper))),
          sumlevel = "610",
          st_fips = geo$state,
          leg_dist_upper = geo$statelegislativedistrictupper,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

    if (level[l] == "state.legislative.district.lower") {

           if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%03d", as.numeric(geo$statelegislativedistrictlower))),
          sumlevel = "620",
          st_fips = geo$state,
          leg_dist_lower = geo$statelegislativedistrictlower,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }


      if (level[l] == "zip.code") {

            if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = as.character(NA),
          sumlevel = "860",
          zip = geo$zipcodetabulationarea,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "school.district.elementary") {

           if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%05d", as.numeric(geo$schooldistrictelementary))),
          sumlevel = "950",
          st_fips = geo$state,
          sch_dist_ele = geo$schooldistrictelementary,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "school.district.secondary") {

            if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

        vdata[[ paste0(v,l) ]]  <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%05d", as.numeric(geo$schooldistrictsecondary))),
          sumlevel = "960",
          st_fips = geo$state,
          sch_dist_sec = geo$schooldistrictsecondary,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

      if (level[l] == "school.district.unified") {

            if (multiply == TRUE)
              { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
           else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
              }

         vdata[[ paste0(v,l) ]] <- data.table(
          geoid = paste0(sprintf("%02d", as.numeric(geo$state)), sprintf("%05d", as.numeric(geo$schooldistrictunified))),
          sumlevel = "970",
          st_fips = geo$state,
          sch_dist_uni = geo$schooldistrictunified,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
      }

    } # end level loop
 } # end combine FALSE

 if (combine == TRUE ) {

   vdata <- data.table()

   if (multiply == TRUE)
          { cmoe <-  as.vector(new_error * conf.level * 100)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 100] <- 100 }
              }
    else { cmoe <-  as.vector(new_error * conf.level)
                if (method[v] %in% c("prop", "proportion"))
                  { cmoe[cmoe > 1] <- 1 }
            }

         vdata <- data.table(
          geoid = as.character(NA),
          combined_group = combine.name,
          var_name = varname[v],
          est = if (multiply == TRUE) { as.vector(p * 100)} else { as.vector(p)},
          moe = cmoe
        )
 }

  if ( trace ) {
  print(paste0(". . . . . .  ", round( v / newvars * 100, 1), "%"))
  }

  } # end variable loop

  print(". . . . . .  Formatting output")

  if ( combine == FALSE ) {

    mdata <- rbindlist(vdata, fill = TRUE)

  }

  if ( combine == TRUE ) {

   mdata <- copy(vdata)

  }

if (format.out == "long") {

  fdata <- copy(mdata)

  }

else if (format.out == "wide") {

  # dcast variables
  namesformula <- grep("var_name|est|moe", names(mdata), value = TRUE, invert = TRUE)
  ff <- paste(paste(namesformula, collapse = " + "), " ~ var_name")
  wdata <- data.table::dcast(mdata, formula(ff), value.var = c("est", "moe"))

  # awful way to deal with names (to improve!)
  vnames <- sort(unique(mdata$var_name))

  ids <- c( "sumlevel", "geoid", "region","division","st_fips","cnty_fips","cnty_sub_fips","tract_fips","block_group","place","indian_area","msa","csa","necta","urban_area","cong_dist","leg_dist_upper","leg_dist_lower","puma","zip","sch_dist_ele","sch_dist_sec","sch_dist_uni", "combined_group")

  ids <- ids[ids %in% names(wdata)]

  a <- wdata[, grep("_moe|moe_", names(wdata), invert = TRUE), with = FALSE]
  b <- wdata[, grep("_est|est_", names(wdata), invert = TRUE), with = FALSE]
  setnames(a, names(a), gsub("_est|est_", "", names(a)))
  setnames(b, names(b), gsub("_moe|moe_", "", names(b)))
  a <- a[, sort(names(a[,!ids, with = FALSE])), with = FALSE]
  b <- b[, sort(names(b[,!ids, with = FALSE])), with = FALSE]

  setnames(a, names(a), paste0(names(a), "_est"))
  setnames(b, names(b), paste0(names(b), "_moe"))
  wdata <- cbind(wdata[, ids, with = FALSE], a, b)
  vars <- sort(names(wdata[, !ids, with = FALSE]))

  # # old code
  # vars  <- names(wdata)
  # setnames(wdata, names(wdata), gsub("_est", "", vars))

  # ids <- c("stfid","sumlevel","st_fips","cnty_fips","cnty_sub_fips","tract_fips","block.group","cong_dist","sch_dist_sec","sch_dist_ele")

  # vars  <- names(wdata[,!ids, with = FALSE])
  # vars <- sort(vars)

  # final dataset
  fdata <- wdata[, c(ids, vars), with = FALSE]

  if (combine == FALSE ) {
  data.table::setkey(fdata, sumlevel)
  }

  }

  # clean names using census data
   if (dataset == "sf1" | dataset == "sf3") {
    dnames <- grep("moe", names(fdata), value = TRUE)
    fdata <- fdata[, -dnames, with = FALSE]
    enames <- gsub("_est", "", names(fdata))
    setnames(fdata, names(fdata), enames)
  }

  # lower names of variables
    dnames <- names(fdata)
    setnames(fdata, dnames, tolower(dnames))

  # write csv
    if (!is.null(file)) {
        fwrite(fdata, file = file, na = 'NA')
        print(". . . . . .  Data exported to a CSV file! ")
      }

  return(fdata)

} # end function
sdaza/acsr documentation built on June 18, 2020, 6:53 p.m.