DemoTools: Standardize, Evaluate, and Adjust Demographic Data

Documented in surv10 surv5 survN survRatioError

###############################################################################
#' Census survival estimation
#'
#' @description This function family implements the spreadsheets from the UN repository
#' "Adult Mortality Estimation Files/countries/JAPAN/CENSUS~1.SUR", prepared by Griffith Feeney ca 2007.
#'
#' @param pop1 numeric. Vector of population counts in 5-year age groups of census 1.
#' @param pop2 numeric. Vector of population counts in 5-year age groups of census 2.
#' @param Age numeric. Vector of ages corresponding to the lower integer bound of the counts.
#' @param date1 date. Date of the first census. See details for ways to express it.
#' @param date2 date. Date of the second census. See details for ways to express it.
#' @param exprior numeric. Vector of remaining life expectancy (not e(0), see details) for same population and
#' time but from a different source.
#' @param OAG logical. Whether or not the final age group is open. Default \code{TRUE}.

#' @details The lengths of \code{pop1} and \code{pop2} should match. We assume that the last age group is open, and throw it out. If your last age group is not open then specify \code{OAG = FALSE}. Dates can be given in three ways 1) a \code{Date} class object, 2) an unambiguous character string in the format \code{"YYYY-MM-DD"}, or 3) as a decimal date consisting in the year plus the fraction of the year passed as of the given date. Value for life expectancy at birth e(0) in \code{exprior} should be set to \code{NA}.
#' @references
#' \insertRef{united1955manual}{DemoTools}
#' \insertRef{united1983manual}{DemoTools}
#' @return The median absolute percent error of the census survival remaining life expectancy vector compared with the external remaining life expectancy estimate (\code{exprior}).
#' @export
#' @importFrom stats median
#' @examples
#'# Based on Feeney's spreadsheets CENSUR~1.XLS, CENSUR~2.XLS, and CENSUR~3.XLS
#'# this is census data from Japan
#'
#' # create a bunch of data to show the three variants.
#' pop1960 <- c(3831870,4502304,5397061,4630775,4193184,4114704,3770907,3274822,
#'    2744786,2559755,2160716,1839025,1494043,1133409,870238,577972,313781,131547)
#' pop1965 <- c(3983902,3854281,4513237,5373547,4572392,4206801,4110076,3751030,
#'    3231736,2697217,2485095,2071540,1719370,1343444,955567,644043,341170,176068)
#' pop1970 <- c(4292503,3988292,3852101,4492096,5347327,4571868,4190340,4085338,
#'    3674127,3198934,2648360,2382691,1970485,1584699,1172155,736258,408191,53116)
#' Age          <- seq(0, 85, by = 5)
#' date1960     <- "1960-10-01"
#' date1965     <- "1965-10-01"
#' date1970     <- "1970-10-01"
#' date1960fake <- "1960-10-05"
#' ex1  <- c(NA,69.45,64.62,59.72,54.87,50.11,45.37,40.65,35.99,
#' 31.40,26.94,22.64,18.52,14.67,11.20,8.25,5.95,6.00)
#' ex2  <- c(NA,70.19,65.33,60.41,55.54,50.74,45.96,41.21,
#' 36.52,31.89,27.39,23.05,18.89,14.99,11.45,8.43,6.06)
#'
#' # reproduces CENSUR~1.XLS
#' # this method called under the hood when censuses are a clean 10 years apart
#' (s10 <- survRatioError(
#'  pop1 = pop1960,
#'  pop2 = pop1970,
#'  Age = Age,
#'  date1 = date1960,
#'  date2 = date1970,
#'  exprior = ex1)) #.44
#'
#' # reproduces CENSUR~3.XLS
#' # this method called under the hood when censuses are a clean 5 years apart
#' (s5 <- survRatioError(
#' 		pop1 = pop1965,
#' 		pop2 = pop1970,
#' 		Age = Age,
#' 		date1 = date1965,
#' 		date2 = date1970,
#' 		exprior = ex2)) #.40
#'
#' # reproduces CENSUR~2.XLS (if indeed censuses are oddly spaced)
#' # this method called under the hood when censuses are not spaced
#' # a clean 5 or 10 years apart.
#' (sn <- survRatioError(
#' 		pop1 = pop1960,
#' 		pop2 = pop1970,
#' 		Age = Age,
#' 		date1 = date1960fake,
#' 		date2 = date1970,
#' 		exprior = ex1)) # 1.067818
#'
# result matches CENSUR~2 exactly if you change the
# decimal dates used in the spreadsheet to the following
# print(dec.date(date1960fake),digits=15) # date1
# print(dec.date(date1970),digits=15)     # date2
# can't call up the synthetic method on the original dates
# because it gets automatically directed to the cleaner 10-year
# staggered method, which is preferred.

survRatioError <-
  function(pop1, pop2, Age, date1, date2, exprior, OAG = TRUE) {
    stopifnot(all.equal(length(pop1), length(pop2), length(Age)))
    
    # census spacing used to stagger ages.
    date1   <- dec.date(date1)
    date2   <- dec.date(date2)
    # calculate intercensal period
    int     <- date2 - date1
    
    # TR: add chunk: control age grouping here:
    if (missing(Age)) {
      Age     <- names2age(pop1, pop2)
    }
    
    pop1    <- groupAges(Value = pop1, Age = Age, N = 5)
    pop2    <- groupAges(Value = pop2, Age = Age, N = 5)
    Age     <- names2age(pop1)
    
    N       <- length(pop1)
    if (OAG) {
      OAvalue1    <- pop1[N]
      OAvalue2    <- pop1[N]
      OAi         <- N
      N           <- N - 1
      pop1        <- pop1[-OAi]
      pop2        <- pop2[-OAi]
      Age         <- Age[-OAi]
    }
    
    # this
    if (abs(int - 5) < .01) {
      # are we super close to 5 years apart?
      lx <- surv5(pop1, pop2)
    } else {
      if (abs(int - 10) < .01) {
        # are we super close to 10 years apart?
        lx <- surv10(pop1, pop2)
      } else {
        # otherwise use the synthetic method
        lx <- survN(pop1, pop2, int)
      }
    }
    # all remaining columns should be identical.
    
    # Lx approximated as avg of lx
    Lx         <- c(NA, (lx[-length(lx)] + lx[-1])/2)
    # 2.5 times the pairwise sums of that to get the 5-year Lx approximation
    #Lx5        <- c(2.5 * (Lx[-length(Lx)] + Lx[-1]))
    Lx5        <- ((Lx[-length(Lx)] + Lx[-1])/2)[1:N] * 5
    Lx5[N]     <- Lx[N] * exprior[N]
    
    # cut to size
    Lx         <- Lx[1:N]
    Lx5        <- Lx5[1:N]
    exprior    <- exprior[1:N]
    
    Tx         <- lt_id_L_T(Lx5)
    ex.est     <- Tx / Lx
    dif        <- exprior - ex.est
    abserror   <- 100 * abs(dif / exprior)
    abserror[N] <- median(abserror[-N], na.rm = TRUE)
    
    # return the median absolute percent error
    median(abserror, na.rm = TRUE)
  }

#' Estimate survival curve from censuses spaced 10 years apart.
#'
#' @description This function reproduces calculations through column H of \code{CENSUR~1.XLS} by Griff Feeney. Censuses are assumed to be spaced 10 years apart and population counts for both censuses in 5-year age groups. The staggered ratio of these approximates lifetable function npx =1-nqx, i.e. probability of surviving between age x and age x+n. The cumulative product of this approximates the survival function lx.
#'
#' @param pop1 numeric. Vector of population counts in 5-year age groups of census 1.
#' @param pop2 numeric. Vector of population counts in 5-year age groups of census 2.
#' @return Survival function lx as a numeric vector with radix 1.
#' @details Checking for census spacing of 10 years must happen prior to this function.
#' Also, we assume the open age group has already been trimmed off.
#' @export

#' @examples
#'# 1960 vs 1970 pops
#' pop1 <- c(3831870,4502304,5397061,4630775,4193184,4114704,3770907,3274822,
#' 		2744786,2559755,2160716,1839025,1494043,1133409,870238,577972,313781)
#' pop2 <- c(4292503,3988292,3852101,4492096,5347327,4571868,4190340,4085338,
#' 		3674127,3198934,2648360,2382691,1970485,1584699,1172155,736258,408191)
#' surv10(pop1,pop2)

surv10 <- function(pop1, pop2) {
  N       <- length(pop1)
  stagger <- 2
  ratio   <- pop2[-c(1:stagger)] / pop1[1:(N - stagger)]
  
  NN       <- length(ratio)
  sx1     <- 1:NN %% 2 == 1
  sx2     <- 1:NN %% 2 == 0
  lx1     <- c(1, cumprod(ratio[sx1]))
  lx2     <- mean(lx1[1:2]) * c(1, cumprod(ratio[sx2]))
  
  # at this point lx vectors can be of different lengths.
  # need to be of same length for interleaving trick to work.
  N1 <- length(lx1)
  N2 <- length(lx2)
  if (N2 < N1) {
    lx2 <- c(lx2, rep(NA, N1 - N2))
  }
  # use object trick to interleave
  lx      <- c(rbind(lx1, lx2))
  lx[1:N]
}

#' Estimate survival curve from censuses spaced 5 years apart.
#'
#' @description This function reproduces calculations of \code{CENSUR~3.XLS} by Griff Feeney. Censuses are assumed to be spaced 5 years apart and population counts for both censuses in 5-year age groups. The staggered ratio of these approximates lifetable function npx =1-nqx, i.e. probability of surviving between age x and age x+n. The cumulative product of this approximates the survival function lx.
#'
#' @param pop1 numeric. Vector of population counts in 5-year age groups of census 1.
#' @param pop2 numeric. Vector of population counts in 5-year age groups of census 2.
#' @return Survival function lx as a numeric vector with radix 1.
#' @details Checking for census spacing of 5 years must happen prior to this function. Also, we assume the open age group has already been trimmed off.
#' @export

#' @examples
#'# 1965 vs 1970 pops
#' pop1 <- c(3983902,3854281,4513237,5373547,4572392,4206801,4110076,3751030,
#' 3231736,2697217,2485095,2071540,1719370,1343444,955567,644043,341170)
#' pop2 <- c(4292503,3988292,3852101,4492096,5347327,4571868,4190340,4085338,
#' 3674127,3198934,2648360,2382691,1970485,1584699,1172155,736258,408191)
#' surv5(pop1,pop2)
#' \dontrun{
#'plot(seq(0,80,5),surv5(pop1,pop2), t ='l', col='red',
#'     ylim = c(0,1),
#'     ylab = 'Survival function', xlab = 'Age')
#' }
surv5 <- function(pop1, pop2) {
  N       <- length(pop1)
  stagger <- 1
  ratio   <- pop2[-c(1:stagger)] / pop1[1:(N - stagger)]
  lx      <- cumprod(c(1, ratio))
  lx
}

#' Estimate survival curve from censuses spaced N years apart.
#'
#' @description This function reproduces calculations of \code{CENSUR~2.XLS} by Griff Feeney. Censuses are assumed to be spaced N years apart and population counts for both censuses in 5-year age groups. The staggered ratio of these approximates lifetable function npx =1-nqx, i.e. probability of surviving between age x and age x+n. The cumulative product of this approximates the survival function lx.
#'
#' @param pop1 numeric. Vector of population counts in 5-year age groups of census 1.
#' @param pop2 numeric. Vector of population counts in 5-year age groups of census 2.
#' @param interval numeric. Length of the inter-censal period.
#'
#' @return Survival function lx as a numeric vector with radix 1.
#' @details Checking for census spacing of N years must happen prior to this function. Also, we assume the open age group has already been trimmed off.
#' @export
#' @details We assume the open age group has already been trimmed off. The time interval `N`` must be measured as a decimal in advance and specified. This method uses a synthetic approximation of person-years lived in each age interval over the intercensal period and then a second approximation based on age-specific growth rates to produce an estimate of lifetable \eqn{p(x)}. This value of \eqn{p(x)} is not bounded to \eqn{[0,1]}, and therefore the resulting lx approximation is not strictly positive or monotonically non-increasing, so downstream usage of this result is limited. For example, is not advisable to use it in the standard way to derive the lifetable dx.
#'
#' @export

#' @examples
#'# 1960 vs 1970 pops
#' pop1 <- c(3831870,4502304,5397061,4630775,4193184,4114704,3770907,3274822,
#' 		2744786,2559755,2160716,1839025,1494043,1133409,870238,577972,313781)
#' pop2 <- c(4292503,3988292,3852101,4492096,5347327,4571868,4190340,4085338,
#' 		3674127,3198934,2648360,2382691,1970485,1584699,1172155,736258,408191)
#' interval <- 10
#' survN(pop1,pop2,interval)
#' \dontrun{
#' plot(seq(0,80,5),survN(pop1,pop2,interval), t ='l', col='red',
#'     ylim = c(0,1.1),
#'     ylab = 'Survival function', xlab = 'Age')
#' }

survN <- function(pop1, pop2, interval) {
  r   <- log(pop2 / pop1) / interval
  PYL <- (pop2 - pop1) / (interval * r)
  Sx  <-
    (PYL[-1] * exp(2.5 * r[-1])) / (PYL[-length(PYL)] * exp(-2.5 * r[-length(r)]))
  lx  <- cumprod(c(1, Sx))
  lx
}