require(Matrix)
require(splines)
require(MortalitySmooth)
# pclm.control ---------------------------------------------------------------
#' Auxiliary for controlling PCLM fitting
#'
#' @description
#' Auxiliary function for controlling PCLM fitting. Use this function to set control
#' parameters of the \code{\link{pclm.default}} and other related functions.
#'
#' @param x.div Number of sub-classes within PCLM tim/age class (default is 1).
#' Low value of the parameter makes the PCLM computation faster. It is however recommended to set
#' it to higher value (e.g. 10) for better \code{nax} estimates.
#' @param x.auto.trans Logical indicating if automatically multiple age intervals to remove fractions.
#' \code{TRUE} is the recommended value. See also examples in \code{\link{pclm.default}}.
#' @param x.max.ext Integer defining maximal multiple of an age interval. See also \code{\link{pclm.interval.multiple}}.
#' @param zero.class.add Logical indicating if additional zero count class (open interval)
#' should be added after last age class. \code{TRUE} is the recommended value. See \code{\link{pclm.nclasses}} and \code{\link{pclm.compmat}}.
#' @param zero.class.end Positive indicating the end of zero count class = anticipated end of
#' last (open) interval. If set to \code{NULL} and \code{zero.class.add == TRUE} then it is calculated
#' automatically based on \code{zero.class.frac}. See \code{\link{pclm.nclasses}} and \code{\link{pclm.compmat}}.
#' @param zero.class.frac Fraction of total range of \code{x} (age/time vector) added as the last zero-count interval
#' when \code{zero.class.end == NULL}. Used in \code{\link{pclm.compmat}}. Increase this value if the right tail
#' of the PCLM fit is badly fitted (use \code{\link{plot.pclm}} to diagnose).
#' @param bs.use Logical indicating if use B- or P-spline basis to speed-up computations.
#' Possible values: \code{"auto"}, \code{TRUE}, and \code{FALSE}. Used by \code{\link{pclm.compmat}} function.
#' @param bs.method Basis for B- or P-spline used by \code{\link{pclm.compmat}} function.
#' Possible values:
#' \itemize{
#' \item{\code{"MortalitySmooth"}}{ - gives "P-splines" basis based on \code{MortSmooth_bbase} of
#' \code{MortalitySmooth} package (recommended)}
#' \item{\code{"bs"}}{ - gives basic B-splines basis based on \code{\link{bs}} \code{\link{splines}}.}
#' }
#' @param bs.df B- or P- spline degree of freedom (df, number of inner knots)
#' or a way to its calculation used in \code{\link{pclm.compmat}} function.
#' The value is automatically limited by the \code{bs.df.max}.
#' It can take corresponding values:
#' \itemize{
#' \item{\code{"maxprec"}}{ - df equal to the number of ungrouped (raw) age classes (recommended option).}
#' \item{\code{"thumb"}}{ - 'rule of thumb': one knot for the B-spline basis each 4-5 observations.}
#' \item{\code{integer}}{ - df given explicitly.}
#' }
#' @param bs.df.max Maximal number of knots (df) for B- or P-spline basis.
#' Defaut value is 250, but can be decreased if computations are slow.
#' Used in \code{\link{pclm.compmat}}.
#' @param bs.deg Degree of the piecewise polynomial for B- or P-spline basis.
#' Default and recommended value is 3. Used in \code{\link{pclm.compmat}}.
#' @param opt.method Selection criterion for \code{lambda} (smooth parameter) in
#' \code{\link{pclm.opt}} function. Possible values are \code{"AIC"} and \code{"BIC"} (recommended).
#' @param opt.tol Tolerance for \code{\link{pclm.opt}} function that estimates smooth
#' parameter \code{lambda}.
#' @param pclm.deg Order of differences of the components of \code{b} (PCLM coeficients, beta in the reference [1]).
#' Default value is 2, some other values may cause algorithm to not
#' work properly. Used by \code{\link{pclm.core}} function.
#' @param pclm.max.iter Maximal number of iterations in \code{\link{pclm.core}}
#' function. Default is 100, but increase it when got a warning.
#' @param pclm.lsfit.tol Tolerance for \code{\link{lsfit}} function used
#' in \code{\link{pclm.core}} function.
#' @param pclm.tol Tolerance for \code{\link{pclm.core}} function.
#' @return List with control parameters.
#' @seealso \code{\link{pclm.default}}, \code{\link{pclm.core}},
#' \code{\link{pclm.opt}}, \code{\link{pclm.aggregate}}, \code{\link{pclm.compmat}},
#' \code{\link{pclm.interval.multiple}}, \code{\link{pclm.nclasses}},
#' \code{\link{plot.pclm}}, and \code{\link{summary.pclm}}.
#' @references
#' \enumerate{
#' \item{Rizzi S, Gampe J, Eilers PHC. Efficient estimation of smooth distributions from
#' coarsely grouped data. Am J Epidemiol. 2015;182:138?47.}
#' }
#' @author Maciej J. Danko <\email{danko@demogr.mpg.de}> <\email{maciej.danko@gmail.com}>
#' @export
pclm.control<-function(x.div = 1L,
x.auto.trans = TRUE,
x.max.ext = 25L,
zero.class.add = TRUE,
zero.class.end = NULL,
zero.class.frac = 0.2,
bs.use = 'auto',
bs.method = c('MortalitySmooth', 'bs'),
bs.df = c('maxprec', 'thumb'),
bs.df.max = 250L,
bs.deg = 3L,
opt.method = c('BIC','AIC'),
opt.tol = .Machine$double.eps^0.5,
pclm.deg = 2L,
pclm.max.iter = 100L,
pclm.lsfit.tol = .Machine$double.eps^0.5,
pclm.tol = .Machine$double.eps^0.5){
if (!(opt.method[1] %in% c('BIC','AIC'))) stop ('"AIC" or "BIC" should be used for opt.method')
if (!(bs.df[1] %in% c('maxprec','thumb'))) if (!is.numeric(bs.df[1]))
stop('bs.df can take "maxprec", "thumb", or any positive integer value')
if (!(bs.method[1] %in% c('MortalitySmooth', 'bs'))) stop ('"MortalitySmooth" or "bs" can only be used for bs.method')
list(x.max.ext = x.max.ext, x.auto.trans = x.auto.trans, x.div = x.div, zero.class.add = zero.class.add, zero.class.end = zero.class.end,
zero.class.frac = zero.class.frac, bs.use = bs.use, bs.method = bs.method[1], bs.df = bs.df[1], bs.df.max = bs.df.max, bs.deg = bs.deg,
opt.method = opt.method[1], opt.tol = opt.tol, pclm.deg = pclm.deg, pclm.max.iter = pclm.max.iter, pclm.lsfit.tol = pclm.lsfit.tol,
pclm.tol = pclm.tol)
}
# pclm.interval.multiple ---------------------------------------------------------------
#' Multiple for the original age/time interval length
#' @description
#' Calculates minimal multiple of the orginal age/time interval length to remove fractions in \code{x}.
#' The function (together with \code{x} and some control parameters) is used to calculate the
#' internal (raw, nonaggregated) interval length during PCLM computations.
#' @param x Vector with start of the interval for age/time classes.
#' @param control List with additional parameters. See \code{\link{pclm.control}}.
#' @author Maciej J. Danko <\email{danko@demogr.mpg.de}> <\email{maciej.danko@gmail.com}>
#' @seealso \code{\link{pclm.default}}, \code{\link{pclm.control}}, and \code{\link{pclm.nclasses}}.
#' @export
pclm.interval.multiple <- function(x, control = list()) {
control <- do.call("pclm.control", control)
for (j in 1:(control$x.max.ext)){
y <- (j * x) %% 1
if (all(y == 0)) break
}
j
}
# pclm.nclasses -----------------------------------------------
#' Calculate the number of PCLM internal (raw) classes
#'
#' @description
#' Calculate the number of PCLM internal (raw) classes.
#' @param x Vector with start of the interval for age/time classes.
#' @param control List with additional parameters. See \code{\link{pclm.control}}.
#' @examples
#' \dontrun{
#' # Use a simple data set
#' x <- c(0, 1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
#' dx <- c(38, 37, 17, 104, 181, 209, 452, 1190, 2436, 3164, 1852, 307, 13)
#'
#' # Define the open interval by zero.class.frac
#' control.1 = list(x.div = 5, zero.class.frac = 0.2, zero.class.end = NULL)
#' pclm.nclasses(x, control = control.1) #calculate number of raw classes
#' AU10p.1A <- pclm.default(x, dx, control = control.1)
#' length(AU10p.1A$raw$x) # the number of raw classes after fit
#' plot(AU10p.1A)
#'
#' # Define the open interval by zero.class.end
#' control.2 = list(x.div = 5, zero.class.end = 109)
#' pclm.nclasses(x, control = control.2) #calculate the number of raw classes
#' AU10p.1B <- pclm.default(x, dx, control = control.2)
#' length(AU10p.1B$raw$x) # the number of raw classes after fit
#' plot(AU10p.1B)
#
#' }
#' @seealso \code{\link{pclm.control}}, \code{\link{pclm.interval.multiple}},
#' @author Maciej J. Danko <\email{danko@demogr.mpg.de}> <\email{maciej.danko@gmail.com}>
#' @export
pclm.nclasses<-function(x, control = list()) {
control <- do.call("pclm.control", control)
if (control$zero.class.add)
if (length(control$zero.class.end) == 0){
drx <- diff(range(x))
tmp <- control$x.div * drx * pclm.interval.multiple(x, control)
tmp <- tmp * (1 + control$zero.class.frac)
tmp <- tmp + 1
} else {
drx <- diff(range(x, control$zero.class.end))
tmp <- control$x.div * drx * pclm.interval.multiple(x, control)
tmp <- tmp + 1
}
tmp
}
# pclm.compmat -------------------------------------------------------------------------
#' Create composition matrix object
#'
#' @description
#' Construct the composition matrix object for automatically recalibrated age classes. \cr\cr \emph{\bold{The internal function}}.
#'
#' @param x Vector with start of the interval for age/time classes. x * x.div must be an integer.
#' The appropriate correction for fractional intervals based on the interval multiple (\code{\link{pclm.interval.multiple}}) is performed in \code{\link{pclm.default}}.
#' @param y Vector with counts, e.g. \code{ndx}. It must have the same length as \code{x}.
#' @param exposures Vector with exposures used to calculate smoothed mortality rates (see reference [1] and \code{\link{pclm.default}}).
#' @param control List with additional parameters. See \code{\link{pclm.control}}.
#' @return List with components:
#' @return \item{\code{C}}{ Composition matrix.}
#' @return \item{\code{X}}{ B-spline base, P-spline base, or identity matrix.}
#' @return \item{\code{x}}{ Corrected age/time vector.}
#' @return \item{\code{y}}{ Corrected vector with counts.}
#' @return \item{\code{open.int.len}}{ Length of the open interval in age classes.}
#' @return \item{\code{exposures}}{ Vector with exposures if it was used to construct the composition matrix.}
#' @return \item{\code{control}}{Used control parameters, see \code{\link{pclm.control}}.}
#' @return \item{\code{warn.list}}{ List with warnings.}
#' @author Maciej J. Danko <\email{danko@demogr.mpg.de}> <\email{maciej.danko@gmail.com}>
#' @details
#' The details of matrix construction can be found in reference [1].
#' if \code{bs.use == TRUE} then P- or B- splines are used instead of identity matrix (see \code{\link{pclm.control}}).\cr\cr
#' The dimension of constructed composition matrix can be determined before its computation.
#' The shorter dimension equals to the length of data vector + 1, whereas the longer dimension is
#' determined by the function \code{\link{pclm.nclasses}} and for \code{zero.class.end == NULL} equals:\cr\cr
#' \code{(x.div * (max(x) - min(x)) * m) * (1 + zero.class.frac) + 1}\cr\cr
#' or\cr\cr
#' \code{x.div * (zero.class.end - min(x)) * m + 1}\cr\cr
#' otherwise,
#' where \code{m} is an interval multiple calculated by \code{\link{pclm.interval.multiple}}.
#' See also \code{\link{pclm.nclasses}}.
#' @references
#' \enumerate{
#' \item{Rizzi S, Gampe J, Eilers PHC. Efficient estimation of smooth distributions from coarsely grouped data. Am J Epidemiol. 2015;182:138?47.}
#' \item{Camarda, C. G. (2012). MortalitySmooth: An R Package for Smoothing Poisson Counts with P-Splines. Journal of Statistical Software. 50, 1-24.}
#' \item{Hastie, T. J. (1992) Generalized additive models. Chapter 7 of Statistical Models in S eds J. M. Chambers and T. J. Hastie, Wadsworth & Brooks/Cole.}
#' }
#' @seealso \code{\link{pclm.default}}, \code{\link{pclm.control}}, \code{\link{pclm.interval.multiple}}, and \code{\link{pclm.nclasses}}.
#' @importFrom Matrix sparseMatrix
#' @importFrom splines bs
#' @keywords internal
pclm.compmat<-function(x, y, exposures = NULL, control = list()){
control <- do.call("pclm.control", control)
if (control$bs.use == 'auto') {
control$bs.use <- (pclm.nclasses(x, control) >= control$bs.df.max)
message(paste('bs.use automatically set to', control$bs.use))
} else if (!is.logical(control$bs.use)) stop('bs.use can take only "auto", TRUE or FALSE values.')
if ((pclm.nclasses(x, control) >= control$bs.df.max) && (control$bs.use == FALSE)) warning(immediate. = TRUE, 'Big composition matrix detected. Calculations can be slow. Set "bs.use" to TRUE.')
.MortSmooth_Bbase<-function (x, xl = min(x), xr = max(x), df = floor(length(x) / 4), deg = control$bs.deg) {
#Modified version of MortSmooth_bbase MortalitySmooth
ndx <- df-deg
dx <- (xr - xl)/ndx
knots <- seq(xl - deg * dx, xr + deg * dx, by = dx)
P <- outer(x, knots, function (x, t) (x - t)^deg * (x > t))
D <- diff(diag(dim(P)[2]), diff = deg + 1)/(gamma(deg + 1) * dx^deg)
B <- (-1)^(deg + 1) * P %*% t(D)
B
}
warn.list <- list()
if (any(abs(as.integer(x) - x) > 1e-6)) {
WARN <- 'Fractional values in age vector. Values were rounded'
warning(immediate. = TRUE, WARN)
warn.list <- c(warn.list, WARN)
}
#x <- as.integer(round(x))
if ((length(x) < (3 + control$bs.deg)) && (control$bs.use)) {
WARN <- 'Not enough data classes to use B-spline basis. Exact method was used'
warning(immediate. = TRUE, WARN)
warn.list <- c(warn.list, WARN)
control$bs.use <- FALSE
}
if (control$zero.class.add) {
if (length(control$zero.class.end) == 0){
mstep <- min(diff(x))
r <- diff(range(x))
control$zero.class.end <- ceiling(ceiling(r * control$zero.class.frac) / mstep) * mstep + max(x)
}
y <- c(y, 0)
open.int.len <- control$zero.class.end - x[length(x)]
x <- c(x, control$zero.class.end)
} else open.int.len <- NA
segm <- round(x * control$x.div)
if (length(unique(segm)) != length(segm)) stop('Non-unique values in x after rounding. Try to increase x.max.ext, x.div, or round age classes manually.')
C2 <- approx(x = segm + 1, y = 1:length(segm), method = 'constant', xout = (min(segm):(max(segm))) + 1, rule = 2)
SparMat <- Matrix::sparseMatrix(C2$y, C2$x)
C. <- 1 * Matrix(SparMat, sparse = FALSE)
if (control$bs.use) {
if (length(exposures) > 0) stop('Exposures cannot be used together with B-Splines.')
control$bs.method <- control$bs.method[1]
control$bs.df <- control$bs.df[1]
if (control$bs.df == "maxprec") control$bs.df <- min(control$bs.df.max, length(x) * control$x.div-2)
else if (tolower(control$bs.df) == 'thumb') control$bs.df <- min(control$bs.df.max, (length(x) * control$x.div-2)/4)
else if(is.na(as.numeric(control$bs.df))) stop(paste('Wrong bs.df parameter:', control$bs.df))
control$bs.df <- max(length(segm)-2, min(control$bs.df, length(segm) * control$x.div-2))
if (control$bs.method == 'bs'){
X <- splines::bs(C2$x, df = control$bs.df)
} else if (tolower(control$bs.method) == 'mortalitysmooth'){
X <- .MortSmooth_Bbase(C2$x, df = control$bs.df)
} else stop('Unknown B-spline basis method.')
} else{
if (length(exposures) > 0){
d <- length(x)-length(exposures)
if (d > 0) exposures <- c(exposures, rep(1e-6, d))
expo <- as.matrix(do.call('cbind', rep(list(exposures), dim(C.)[2])))
# expo <- (rep(c(exposures), dim(C.)[2]))
# dim(expo) <- rev(dim(C.))
# expo <- t(expo)
C. <- C. * expo
} else expo <- NULL
#control$bs.df <- NA
X <- diag(dim(C.)[2])
}
list(C = as.matrix(C.),
X = X,
y = y,
x = x,
open.int.len = open.int.len,
exposures = exposures,
control = control,
warn.list = warn.list)
}
# pclm.core ------------------------------------------------------------------------
#' Fit the penalized composite link model (PCLM)
#'
#' @description
#' Efficient estimation of smooth distribution from coarsely grouped data based on
#' PCLM algorithm described in Rizzi et al. 2015.
#' For further description see the reference [1] and \code{\link{pclm.default}}.\cr\cr
#' \emph{\bold{The internal function}}.
#' @param CompositionMatrix Object constructed by \code{\link{pclm.compmat}}.
#' @param lambda Smoothing parameter.
#' @param control List with additional parameters. See \code{\link{pclm.control}}.
#' @return List with components:
#' @return \item{\code{gamma}}{ Ungrouped counts.}
#' @return \item{\code{dev}}{Deviance.}
#' @return \item{\code{aic}}{AIC for the fitted model.}
#' @return \item{\code{bic}}{BIC for the fitted model.}
#' @return \item{\code{warn.list}}{List with warnings.}
#' @author
#' Silvia Rizzi (original code, see Appendix #2 of the reference [1]), Maciej J. Danko (modification) <\email{danko@demogr.mpg.de}> <\email{maciej.danko@gmail.com}>
#' @references
#' \enumerate{
#' \item{Rizzi S, Gampe J, Eilers PHC. Efficient estimation of smooth distributions from coarsely grouped data. Am J Epidemiol. 2015;182:138?47.}
#' }
#' @keywords internal
pclm.core <- function(CompositionMatrix, lambda = 1, control = list()){
control <- do.call("pclm.control", control)
warn.list <- list()
C <- CompositionMatrix$C
X <- CompositionMatrix$X
y <- CompositionMatrix$y
y <- as.matrix(as.vector(y))
nx <- dim(X)[2] #number of small classes
D <- diff(diag(nx), diff = control$pclm.deg)
bstart <- log(sum(y) / nx);
b <- rep(bstart, nx);
was.break <- FALSE
for (it in 1:control$pclm.max.iter) {
b0 <- b
eta <- X %*% b
gam <- exp(eta)
mu <- C %*% gam
w <- c(1 / mu, rep(lambda, nx - control$pclm.deg))
Gam <- gam %*% rep(1, nx)
Q <- C %*% (Gam * X)
z <- c(y - mu + Q %*% b, rep(0, nx - control$pclm.deg))
ls.x <- rbind(Q, D)
Fit <- lsfit(ls.x, z, wt = w, intercept = FALSE, tolerance = control$pclm.lsfit.tol)
b <- Fit$coef
db <- max(abs(b - b0))
if (db < control$pclm.tol) {
was.break <- TRUE
break
}
}
if (!was.break) {
WARN <- 'Maximum iteration reached without convergence. Try to increase pclm.max.iter parameter.'
warning(immediate. = TRUE, WARN)
warn.list <- c(warn.list, WARN)
}
R <- t(Q) %*% diag(c(1 / mu)) %*% Q
H <- solve(R + lambda * t(D) %*% D) %*% R
fit <- list()
.trace <- sum(diag(H))
ok <- y > 0 & mu > 0
fit$dev <- 2 * sum(y[ok] * log(y[ok] / mu[ok]))
fit$gamma <- gam
fit$aic <- fit$dev + 2 * .trace
fit$bic <- fit$dev + .trace*log(length(y))
fit$warn.list <- warn.list
fit
}
# pclm.opt --------------------------------------------------------------------
#' Optimize the smooth parameter \code{lambda} for PCLM method
#' @description
#' \emph{\bold{The internal function}}.
#' @param CompositionMatrix Output of \code{\link{pclm.compmat}}.
#' @param control List with additional parameters. See \code{\link{pclm.control}}.
#' @return List with components:
#' @return \item{\code{X}}{Ungrouped age/time classes.}
#' @return \item{\code{Y}}{Ungrouped counts.}
#' @return \item{\code{lambda}}{Optimal smooth parameter.}
#' @return \item{\code{fit}}{Output of the \code{\link{pclm.core}} derived for the optimal \code{lambda}.}
#' @return \item{\code{CompositionMatrix}}{Used composition matrix. See also \code{\link{pclm.compmat}}.}
#' @return \item{\code{warn.list}}{List with warnings.}
#' @author Maciej J. Danko <\email{danko@demogr.mpg.de}> <\email{maciej.danko@gmail.com}>
#' @references
#' \enumerate{
#' \item{Rizzi S, Gampe J, Eilers PHC. Efficient estimation of smooth distributions from coarsely grouped data. Am J Epidemiol. 2015;182:138?47.}
#' }
#' @keywords internal
pclm.opt<-function(CompositionMatrix, control = list()){
warn.list <- list()
control <- do.call("pclm.control", control)
tryme<-function(G, Altern = 1e200) suppressWarnings(if (class(try(G, silent = TRUE)) == "try-error") Altern else try(G, silent = TRUE))
if (toupper(control$opt.method) == 'AIC') opty<-function (log10lam) tryme(pclm.core(CompositionMatrix, lambda = 10^log10lam, control = control)$aic) else
if (toupper(control$opt.method) == 'BIC') opty<-function (log10lam) tryme(pclm.core(CompositionMatrix, lambda = 10^log10lam, control = control)$bic) else
stop('Unknown method of lambda optimization.')
res.opt <- optimize(f = opty, interval = c(-12, 22), tol = control$opt.tol)$minimum
if((round(res.opt) <= -11.9) || (round(res.opt) >= 21.9)) {
WARN <- 'Lambda reached boundary values.'
warning(immediate. = TRUE, WARN)
warn.list <- c(warn.list, WARN)
}
lambda <- 10^res.opt
#cat(res.opt,'\n')
fit <- pclm.core(CompositionMatrix, lambda = lambda, control = control)
X <- seq(min(CompositionMatrix$x), max(CompositionMatrix$x), 1 / control$x.div)
Y <- fit$gamma
Z <- list(X = X, Y = Y, lambda = lambda, fit = fit, CompositionMatrix = CompositionMatrix,
warn.list = warn.list)
Z
}
# pclm.aggregate ---------------------------------------------------
#' Calculate raw and aggregated life table from the object returned by \code{pclm.opt()} function.
#' @description
#' \emph{\bold{The internal function}}.
#' @param fit Object obtained by \code{\link{pclm.opt}} function.
#' @param out.step Output interval length.
#' @param count.type Type of the data, deaths(\code{"DX"})(default) or exposures(\code{"LX"}.)
#' @param exposures.used Logical indicating if exposures were used to create composition matrix.
#' @return List with components:
#' @return \item{\code{grouped}}{Life-table based on aggregated PCLM fit defined by \code{out.step}.}
#' @return \item{\code{raw}}{Life-table based on original (raw) PCLM fit.}
#' @return \item{\code{fit}}{PCLM fit used to construct life-tables.}
#' @return \item{\code{warn.list}}{List with warnings.}
#' @author Maciej J. Danko <\email{danko@demogr.mpg.de}> <\email{maciej.danko@gmail.com}>
#' @seealso \code{\link{pclm.default}}
#' @keywords internal
pclm.aggregate<-function(fit, out.step = NULL, count.type = c('DX', 'LX'), exposures.used = FALSE){
count.type <- count.type[1]
p <- function(x) round(x, floor(-log10(.Machine$double.eps^0.8)))
warn.list <- fit$warn.list
Y <- fit$Y
X <- fit$X
n <- diff(X)[1] #c(diff(X), diff(X)[length(X)-1])
#Standardize Y to reflect true population size
if ((!exposures.used) && (toupper(count.type) == 'DX')) Y <- (Y / sum(Y)) * sum(fit$CompositionMatrix$y)
if (length(out.step) == 0) x <- p(fit$CompositionMatrix$x) else {
if (p(out.step)<p(n)) {
WARN <- 'Output age interval length (out.step) was too small and was adjusted. It equals the smallest age class now. Re-fit PCLM with higher x.div.'
warning(immediate. = TRUE, WARN)
warn.list <- c(warn.list, WARN)
out.step <- n
}
tmp <- round(out.step/n) * n
if (p(tmp) != p(out.step)) {
WARN <- 'Output age interval length (out.step) rounded to an integer multiple of the smallest age class'
warning(immediate. = TRUE, WARN)
warn.list <- c(warn.list, WARN)
}
out.step <- tmp
x <- p(unique(c(seq(X[1], X[length(X)], out.step), X[length(X)])))
}
if (toupper(count.type) == 'DX'){
if (!exposures.used){
ax <- rep(NA, length(x)-1)
Dx <- rep(NA, length(x)-1)
for (j in 1:(length(x)-1)){ # use aggregate() in the next version
ind <- (x[j] <= X) & (x[j + 1] > X)
sDx <- sum(Y[ind])
mX <- sum(Y[ind] * (X[ind] - X[ind][1] + n/2))
ax[j] <- mX / sDx
Dx[j] <- sDx
}
N <- sum(Dx)
grouped <- data.frame(x = x[-length(x)], lx = N - c(0, cumsum(Dx)[-length(Dx)]),
dx = Dx, ax = ax, n = diff(x), Ax = ax / diff(x))
N <- sum(Y)
# N <- sum(fit$CompositionMatrix$y) #true population size
raw <- data.frame(x = X, lx = N - c(0, cumsum(Y)[-length(Y)]), dx = Y,
ax = c(0.5 * diff(X), 0.5*diff(X)[length(X)-1]),
n = n, Ax = 0.5)
} else {
MX = rep(NA, length(x)-1)
for (j in 1:(length(x)-1)){ # use aggregate() in the next version
ind <- (x[j] <= X) & (x[j + 1] > X)
MX[j] <- sum(Y[ind], na.rm=TRUE)
}
grouped <- data.frame(x = x[-length(x)], mx = MX, n = out.step)
raw <- data.frame(x = X, mx = Y, n = n)
}
} else if (toupper(count.type) == 'LX'){
if (exposures.used) stop('You cannot give exposures as extra parameter if you already selected "DX" as count.type.')
Lx <- rep(NA, length(x) - 1)
for (j in 1:(length(x) - 1)){
ind <- (x[j] <= X) & (x[j + 1] > X)
sLx <- sum(Y[ind])
Lx[j] <- sLx
}
grouped <- data.frame(x = x[-length(x)], Lx = Lx, n = diff(x))
raw <- data.frame(x = X, Lx = Y, n = n)
} else stop('Unknown life-table type')
object <- list(grouped = grouped,
raw = raw,
fit = fit,
warn.list = warn.list)
object
}
# pclm.default -------------------------------------------------------------------------
#' Fitting Penalized Composite Linear Model (PCLM)
#'
#' @description
#' The PCLM method is based on the composite link model, with
#' a penalty added to ensure the smoothness of the target distribution.
#' Estimates are obtained by maximizing a
#' penalized likelihood. This maximization is performed efficiently by a version
#' of the iteratively reweighted least-squares
#' algorithm. Optimal values of the smoothing parameter are chosen by minimizing
#' Bayesian or Akaike’ s Information Criterion
#' [From Rizzi et al. 2015 abstract].
#' @param x Vector with start of the interval for age/time classes.
#' @param y Vector with counts, e.g. \code{ndx}. It must have the same length as \code{x}.
#' @param count.type Type of the data, deaths(\code{"DX"})(default) or exposures(\code{"LX"}.)
#' @param out.step Age interval length in output aggregated life-table. If set to \code{"auto"}
#' then the parameter is automatically set to the length of the shortest age/time interval of \code{x}.
#' @param exposures Optional exposures to calculate smooth mortality rates.
#' A vector of the same length as \code{x} and \code{y}. See reference [1] for further details.
#' @param control List with additional parameters. See \code{\link{pclm.control}}.
#' @details
#' The function has four major steps:
#' \enumerate{
#' \item{Calculate interval multiple (\code{\link{pclm.interval.multiple}} to remove fractional parts from \code{x} vector.
#' The removal of fractional parts is necessary to build composition matrix.}
#' \item{Calculate composition matrix using \code{\link{pclm.compmat}}.}
#' \item{Fit PCLM model using \code{\link{pclm.opt}}.}
#' \item{Calculate aggregated (grouped) life-table using \code{\link{pclm.aggregate}}.}
#' }
#' More details for PCLM algorithm can be found in reference [1], but see also \code{\link{pclm.compmat}}.
#' @return
#' The output is of \code{"pclm"} class with the components:
#' @return \item{\code{grouped}}{Life-table based on aggregated PCLM fit and defined by \code{out.step}.}
#' @return \item{\code{raw}}{Life-table based on original (raw) PCLM fit.}
#' @return \item{\code{fit}}{PCLM fit used to construct life-tables.}
#' @return \item{\code{m}}{Interval multiple, see \code{\link{pclm.interval.multiple}}, \code{\link{pclm.compmat}}.}
#' @return \item{\code{x.div}}{Value of \code{x.div}, see \code{\link{pclm.control}}.}
#' @return \item{\code{out.step}}{Interval length of aggregated life-table, see \code{\link{pclm.control}}.}
#' @return \item{\code{control}}{Used control parameters, see \code{\link{pclm.control}}.}
#' @return \item{\code{warn.list}}{List with warnings.}
#' @author Maciej J. Danko <\email{danko@demogr.mpg.de}> <\email{maciej.danko@gmail.com}>
#' @examples
#' \dontrun{
#' # Use a simple data set. Naive life-table.
#' # Age:
#' x <- c(0, 1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
#' # Death counts:
#' dx <- c(38, 37, 17, 104, 181, 209, 452, 1190, 2436, 3164, 1852, 307, 13)
#' # Survivors at the beginning of age class
#' lx <- sum(dx)-c(0, cumsum(dx[-length(dx)]))
#' # Interval length
#' n <- diff(c(x,110))
#' # Approximation of mortality per age class
#' mx <- - log(1 - dx / lx) / n
#' # Mid-interval vector
#' xh <- x + n / 2
#' # Approximated exposures
#' Lx <- n * (lx - dx) + 0.5 * dx *n
#'
#' # *** Use PCLM
#' # Ungroup dataset with out.step equal minimal interval length
#' min(diff(x))
#' AU10p.1a <- pclm.default(x, dx)
#' print(AU10p.1a)
#' plot(AU10p.1a)
#'
#' # Ungroup AU10 with out.step equal minimal interval length
#' # and get good estimates of nax
#' AU10p.1b <- pclm.default(x, dx, control = list(x.div = 10))
#' print(AU10p.1b)
#' plot(AU10p.1b)
#' # This time number of internal (raw) PCLM classes was high
#' # and automatically P-splines were used to prevent long computations
#'
#' # This number can be estimated before performing
#' # PCLM calclualtions:
#' pclm.nclasses(x, control = list(x.div = 10))
#' # which is the same as in the fitted model
#' length(AU10p.1b$raw$x)
#' # whereas number of classes in the aggregated life-table
#' # depends on out.step
#' length(AU10p.1b$grouped$x)
#'
#' # To speed-up computations we can decrease the number of P-spline knots
#' AU10p.1c <- pclm.default(x, dx, control = list(x.div = 10,
#' bs.use = TRUE, bs.df.max = 100))
#'
#' # *** Diagnostic plots for fitted PCLM model
#' # Aggregated PCLM fit:
#' plot(AU10p.1b, type = 'aggregated')
#' # Raw PCLM fit before aggregation:
#' plot(AU10p.1b, type = 'nonaggregated')
#'
#' # In this PCLM fit aggregated life-table is identical
#' # with nonaggregated
#' plot(AU10p.1a, type = 'aggregated')
#' plot(AU10p.1a, type = 'nonaggregated')
#'
#' # *** Combined summary of pash and pclm objects
#' summary(AU10p.1a)
#' summary(AU10p.1b)
#' summary(AU10p.1c)
#'
#' # *** Smooth and aggregate data into 12-year interval
#' AU10p.2 <- pclm.default(x, dx, out.step = 12)
#' print(AU10p.2)
#' print(AU10p.2, type = 'aggregated') # grouped PCLM life-table
#' print(AU10p.2, type = 'nonaggregated') # raw PCLM life-table
#' plot(AU10p.2)
#'
#' # *******************************************************************
#' # Usage of PCLM methods to fit and plot mortality data
#' # *******************************************************************
#'
#' AU10p.4a <- pclm.default(x, dx, control = list(x.div = 5))
#' X <- AU10p.4a$grouped$x
#' M <- -log(1 - AU10p.4a$grouped$dx/AU10p.4a$grouped$lx)
#' plot(X, log10(M), type='l', lwd = 2,
#' xlim=c(0,130), xlab='Age', ylab='log_10 mortality', col = 2)
#' lines(xh, log10(mx1), type = 'p')
#' tail(AU10p.4a, n = 10)
#' #note that lx has standardized values
#'
#' # Improving the plot to cover more age classes
#' AU10p.4b <- pclm.default(x, dx, control = list(zero.class.end = 150,
#' x.div = 4))
#' X <- AU10p.4b$grouped$x
#' M <- -log(1 - AU10p.4b$grouped$dx / AU10p.4b$grouped$lx)
#'
#' plot(X, log10(M), type='l', lwd = 2,
#' xlim=c(0,130), xlab='Age', ylab='log_10 mortality', col = 2)
#' lines(xh, log10(mx1), type = 'p')
#' tail(AU10p.4a, n = 10)
#'
#' # The change of the order of the difference in pclm algorithm may
#' # affect hte interpretation of the tail.
#' # Try to check pclm.deg = 4 and 5.
#' AU10p.4c <- pclm.default(x, dx, control = list(zero.class.end = 150,
#' x.div = 1, pclm.deg = 4))
#' X <- AU10p.4c$grouped$x
#' M <- -log(1 - AU10p.4c$grouped$dx / AU10p.4c$grouped$lx)
#' plot(X, log10(M), type='l', lwd = 2,
#' xlim=c(0,130), xlab='Age', ylab='log_10 mortality', col = 2)
#' lines(xh, log10(mx1), type = 'p')
#'
#' # Using exposures to fit mortality,
#' # Notice that different approximation of mortality rate is used than in
#' # previous cases.
#' AU10p.4c <- pclm.default(x, dx, exposures = Lx, control = list(zero.class.end = 150,
#' x.div = 1, pclm.deg = 2, bs.use = FALSE))
#' X <- AU10p.4c$grouped$x
#' M <- AU10p.4c$grouped$mx
#' plot(X, log10(M), type='l', lwd = 2,
#' xlim=c(0,130), xlab='Age', ylab='log_10 mortality', col = 2)
#' lines(xh, log10((dx / Lx) / n), type = 'p')
#'
#' # *******************************************************************
#' # Usage of PCLM methods for more complicated dataset
#' # - understanding the out.step, x.div, and interval multiple
#' # *******************************************************************
#'
#' # *** Generate a dataset with varying and fractional interval lengths
#' x <- c(0, 0.6, 1, 1.4, 3, 5.2, 6.4, 8.6, 11, 15,
#' 17.2, 19, 20.8, 23, 25, 30)
#' dx <- ceiling(10000*diff(pgamma(x, shape = 3.8, rate = .4)))
#' barplot(dx/diff(x), width = c(diff(x), 2)) # preview
#' lx <- 10000-c(0, cumsum(dx))
#' dx <- c(dx, lx[length(lx)])
#'
#' # *** Fit PCLM with automatic out.step
#' Bp1 <- pclm.default(x, dx)
#' # Output interval length (out.step) is automatically set to 0.4
#' # which is the minimal interval length in original data.
#' min(diff(x))
#' summary(Bp1) #new out.step can be also read from summary
#' plot(Bp1)
#'
#' # *** Setting manually out.step
#' Bp2 <- pclm.default(x, dx, out.step = 1)
#' plot(Bp2, type = 'aggregated') # The fit with out.step = 1
#' plot(Bp2, type = 'nonaggregated') # It is clear that
#' # PCLM extended internal interval length even without changing x.div
#' # It was done because of the fractional parts in x vector.
#' # This is also a case for Bp1
#' summary(Bp2) #PCLM interval length = 0.2
#' Bp2$raw$n[1:10]
#'
#' # *** Setting manually out.step to a smaller value than
#' # the smallest original interval length
#' Bp3 <- pclm.default(x, dx, out.step = 0.1)
#' summary(Bp3)
#' # We got a warning as out.step cannot be smaller than
#' # smallest age class if x.div = 1
#'
#' # We can change x.div to make it possible
#' Bp3 <- pclm.default(x, dx, out.step = 0.1, control = list(x.div = 2))
#' #0.1 is two times smaller than minimal interval length
#' summary(Bp3) # We were able to change the interval
#' plot(Bp3)
#' # NOTE: In this case x.div has not sufficient value to
#' # get good axn estimates
#' Bp3$grouped$ax[1:10]
#'
#' # This can be changed by the further increase of x.div
#' Bp4 <- pclm.default(x, dx, out.step = 0.1, control = list(x.div = 20))
#' Bp4$grouped$ax[1:10]
#' # NOTE: This time P-spline approximation was used because
#' # the composition matrix was huge
#'
#' # Finally, we were able to get our assumed out.step
#' Bp4$grouped$n[1:10]
#'
#' # In the fitted model the interval multiple (m) is 5.
#' (m <- pclm.interval.multiple(x, control = list(x.div = 20)))
#' summary(Bp4)
#' # Interval multiple determines
#' # the maximal interval length in raw PCLM life-table,
#' (K <- 1 / m)
#' # which is further divided by x.div.
#' K / 20
#' # Simply: 1 / (m * x.div) = 1 / (5 * 20) = 0.01
#' # The interval in the raw PCLM life-table is 10 times shorter than
#' # in the grouped life-table
#' # interval length in aggregated PCLM life-table:
#' Bp4$grouped$n[1:10]/ # divided by
#' # interval length in nonaggregated PCLM life-table:
#' Bp4$raw$n[1:10]
#' # NOTE: The interval for the raw PCLM life-table depends
#' # on original interval, m, and x.div,
#' # whereas the grouped PCLM interval length is set by out.step,
#' # which could be eventually increased if out.step < raw PCLM
#' # interval length.
#'
#' # **** See more examples in the help for pclm.nclasses() function.
#' }
#' @references
#' \enumerate{
#' \item{Rizzi S, Gampe J, Eilers PHC. Efficient estimation of smooth distributions from coarsely grouped data. Am J Epidemiol. 2015;182:138?47.}
#' \item{Rizzi S, Thinggaard M, Engholm G, et al. Comparison of non-parametric methods for ungrouping coarsely aggregated data. BMC Medical Research Methodology. 2016;16:59. doi:10.1186/s12874-016-0157-8.}
#' }
#' @seealso \code{\link{pclm.compmat}}, \code{\link{pclm.interval.multiple}}, and \code{\link{pclm.nclasses}}.
#' @export
pclm.default <- function(x, y, count.type = c('DX', 'LX'), out.step = 'auto', exposures = NULL, control = list()){
control <- do.call("pclm.control", control)
count.type <- count.type[1]
exposures.used <- (length(exposures) > 0)
if (control$bs.use == 'auto') {
control$bs.use <- (pclm.nclasses(x, control) >= control$bs.df.max)
message(paste('bs.use automatically set to', control$bs.use))
} else if (!is.logical(control$bs.use)) stop('bs.use can take only "auto", TRUE or FALSE values.')
if ((pclm.nclasses(x, control) >= control$bs.df.max) && (control$bs.use == FALSE)) warning(immediate. = TRUE, 'Big composition matrix detected. Calculations can be slow. Set "bs.use" to TRUE.')
if (tolower(out.step) == 'auto') {
out.step <- min(diff(x), 1) # removed: 1/pclm.interval.multiple(x, control)
message(paste('out.step automatically set to', out.step))
} else if (!is.numeric(out.step)) stop('Unknown command for out.step. Set out.step as "auto" or as numeric value.')
if (!control$zero.class.add) warning(immediate. = TRUE, 'Omitting zero.class may lead to biased results.')
if ((control$zero.class.add) && (length(control$zero.class.end)>0) && (control$zero.class.end <= max(x))) stop ("zero.class.end lower than last age class.")
if ((toupper(count.type) == 'LX') && (length(exposures) > 0)) stop('You cannot give exposures as extra parameter if you already selected "DX" as count.type.')
WARN <- list()
if (all(order(x) != 1:length(x))) stop ('Age classes are not ordered')
#Automatically change the scale to make x integer
#Output:
# m - magnitude of change,
# x - transformed vector of x
if (control$x.auto.trans) {
m <- pclm.interval.multiple(x, control)
if (m == control$x.max.ext){
WARN <- 'Too small age interval found. Age classes were rounded.'
x <- round(x * control$x.max.ext) / control$x.max.ext
m <- pclm.interval.multiple(x)
warning(immediate. = TRUE, WARN)
} else {
if (m > 1) WARN <- 'Age vector automatically transformed.'
}
x <- as.integer(round(x * m * control$x.div)) / control$x.div #transform
#x <- as.integer(round(x*m)) #OLD, but x is anyway multiplied by x.div in pclm.compmat
} else m <- 1
# Calculate composition matrix
CM <- pclm.compmat(x, y, exposures = exposures, control = control)
#control <- CM$control
#CM$control <- NULL
# Fit PCLM model
fit <- pclm.opt(CompositionMatrix = CM, control = control)
# Change age/time units back to fractional (backward transformation)
fit$CompositionMatrix$x <- fit$CompositionMatrix$x / m
fit$X <- fit$X / m
fit$CompositionMatrix$open.int.len <- fit$CompositionMatrix$open.int.len / m
# Construct grouped (aggregated) and ungrouped (nonaggregated) life-tables
GLT <- pclm.aggregate(fit, out.step = out.step, count.type = count.type[1], exposures.used = exposures.used)
# Construct pclm object
GLT$m <- m
GLT$x.div <- control$x.div
GLT$out.step <- out.step
GLT$warn.list <- c(CM$warn.list, GLT$warn.list, WARN)
GLT$exposures <- exposures
GLT$control <- control
class(GLT) <- 'pclm'
return(GLT)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.