Nothing
#' Fit a generalised additive point process model
#'
#' @param formula a formula for a Poisson process log intensity function (compatible with \link[mgcv]{gam})
#' @param data a data frame
#' @param nodes a list or data frame; see `Details'
#' @param weights a scalar, list or vector; see `Details'
#' @param nquad a scalar giving the number of quadrature nodes for each variable
#' @param approx a length 2 character string; see `Details'
#' @param knots spline knots to pass to \link[mgcv]{gam}
#' @param use.data should splines should be constructed from \code{data} (otherwise uses \code{nodes})?
#' @param trace integers controlling what's reported. Defaults to 0
#'
#' @details
#'
#' \code{ppgam} fits a Poisson process with intensity function \eqn{\lambda({\bf x})} for covariate
#' \eqn{{\bf x} = (x_1, \ldots, x_d)}. The likelihood for this model with events occurring at \eqn{{\bf x}_i},
#' for \eqn{i=1, \ldots, n}, is approximated by quadrature with
#'
#' \deqn{\exp\bigg[-\sum_{j=1}^m w_j \lambda({\bf x}_j^*)\bigg] \prod_{i=1}^n \lambda({\bf x}_i)}
#'
#' where \eqn{{\bf x}_j^*} and \eqn{w_j} are quadrature nodes and weights, for \eqn{j=1, \ldots, m},
#' defined with \code{nodes} and \code{weights}.
#'
#' \code{formula} gives the formula for the log intensity function of a Poisson process.
#' It is passed to \link[mgcv]{gam}. If \code{formula} has no response, i.e. \code{ ~ s(...)},
#' then \code{data} is assumed to give the times at which events occur. Then \code{nodes}
#' is used to control integration of the intensity function. If \code{formula} has a response,
#' e.g. \code{y ~ s(...)}, then \code{y} is assumed binary, comprising only zeros and ones.
#' Then \code{data} is assumed to give the state space of the Poisson process,
#' (e.g. daily time steps if occurrences of events are measured in days)
#' and ones in \code{y} identify when events occur. Note that
#' if \code{formula} has no response, \code{data} will have \eqn{n} rows, and \eqn{m} rows otherwise.
#'
#' \code{nodes} is used to supply nodes for integrating the Poisson
#' process intensity function by quadrature. It is supplied as a list or data
#' frame.
#'
#' If \code{nodes} is a list, its names must correspond to variables on
#' the r.h.s. of formula. Elements of the list, \code{x}, say, can be a vector
#' or 2-column matrix, where \code{length(x) > 1} or \code{nrow(x) > 1}. If a
#' matrix, its first and second columns are taken as integration nodes and
#' weights, respectively. If a vector of length 2, it is assumed to give the
#' range of the \code{nquad} midpoints used as integration nodes. If a longer vector,
#' it is assumed to be the integration nodes, and \code{nquad} is ignored.
#'
#' If \code{nodes} is a data frame, it is assumed to give the integration nodes.
#'
#' \code{nquad} specifies the number of integration nodes per variable, unless
#' nodes are specified in \code{nodes}. If a single integer and
#' \code{is.null(names(nquad))} it is used for all variables. Otherwise,
#' names are matched to variables. An error is returned if any variables do not
#' have values specified.
#'
#' \code{weights} controls the quadrature weights. If \code{nodes} is a list,
#' a scalar multiplies any weights calculated alongside nodes, i.e. node separations.
#' If \code{nodes} is a data frame, weights can be a scalar that is repeated
#' \code{nrow(nodes)}, or a vector of length \code{nrow(nodes)} that gives the weights
#' for each row of \code{nodes}.
#'
#' \code{approx} controls quadrature details. Its first term controls the
#' integration method, which uses either midpoint (\code{"midpoint"}, default),
#' Simpson's (\code{"Simpson"}) or Gauss-Legendre (\code{"Gauss"}) rules. The second
#' term of \code{approx} controls the integration range, which is either the
#' range of the variable (\code{"exact"}), or by calling \code{pretty()} (\code{"pretty"}).
#'
#' \code{trace} controls what is reported. Details of convergence are printed
#' with \code{trace = 1}, of nodes with \code{trace = 2}, and \code{trace = 3}
#' prints both.
#'
#' @references
#'
#' Wood, S. N., Pya, N., & Safken, B. (2016). Smoothing parameter and model selection for general
#' smooth models. Journal of the American Statistical Association, 111(516), 1548-1563.
#'
#' Youngman, B. D., & Economou, T. (2017). Generalised additive point process models for natural
#' hazard occurrence. Environmetrics, 28(4), e2444.
#'
#' @return
#'
#' An object of class \code{gam}, as returned by \code{mgcv::gam}, with parameters,
#' covariance matrices and a few other things swapped
#'
#' @examples
#'
#' # Times of landfalling US hurricanes
#' data(USlandfall)
#'
#' # convert dates to years, as a continuous variable
#' year <- as.integer(format(USlandfall$date, "%Y"))
#' day <- as.integer(format(USlandfall$date, "%j"))
#' USlandfall$year <- year + pmin(day / 365, 1)
#' hits <- subset(USlandfall, landfall == 1)
#'
#' # this creates nodes in the default way
#' m1 <- ppgam( ~ s(year), hits)
#'
#' # some examples of providing nodes
#' nodes.year <- list(year=pretty(USlandfall$year, 20))
#' # as 2 is in trace, nodes and weights are printed
#' m2 <- ppgam( ~ s(year), hits, nodes = nodes.year, trace = 2)
#'
#' # alternatively, we might just want to specify how many nodes to use
#' m3 <- ppgam( ~ s(year), hits, nquad = 30)
#'
#' \donttest{
#'
#' data(windstorm)
#' m4 <- ppgam(~ s(lon, lat, k=20), windstorm)
#'
#' ## Storm peak locations, given the North Atlantic Oscillation (NAO) index
#' # NAO values from https://crudata.uea.ac.uk/cru/data/nao/nao.dat
#' # NAO midpoints and weights based on `hist'
#'
#' NAO.mids <- c(-2.75, -2.25, -1.75, -1.25, -0.75, -0.25, 0.25, 0.75, 1.25, 1.75, 2.25)
#' NAO.wts <- c(0.002, 0.014, 0.057, 0.145, 0.302, 0.427, 0.463, 0.364, 0.171, 0.047, 0.007)
#'
#' m5 <- ppgam(~ te(lat, lon, NAO, d = 2:1, k = c(40, 8), bs = c("ts", "cr")), windstorm,
#' nodes = list(NAO = cbind(NAO.mids, NAO.wts)))
#'
#' }
#'
#' @export
ppgam <- function(formula, data, nodes = NULL, weights = 1, nquad = 15,
approx = c("midpoint", "exact"), knots = NULL, use.data = TRUE, trace = 0) {
# convert and objects of class "Date" to integer
data.class <- sapply(data, class)
if (any(data.class == "Date")) for (i in which(data.class == "Date")) data[,i] <- as.integer(data[,i])
# identify whether response is present
gam.info <- mgcv::interpret.gam(formula)
rsp <- gam.info$response
with.response <- !is.null(rsp)
if (with.response) {
# check whether response okay
if(!all(data[,rsp] %in% 0:1))
stop(paste("Some data$", rsp, " not in {0, 1}", sep=""))
nodes <- data
data <- subset(data, data[,rsp] == 1)
} else {
# add in fake response for mgcv::gam
formula0 <- formula
rsp <- "fake_response"
data$fake_response <- sample(0:1, nrow(data), replace=TRUE)
formula <- as.character(formula)
if (length(formula) > 2) {
formula <- paste(rsp, "~", paste(formula[-1], collpase=" + "))
} else {
formula <- paste(rsp, "~", formula[-1])
}
formula <- as.formula(formula)
gam.info <- mgcv::interpret.gam(formula)
}
# identify covariates
vars <- gam.info$fake.names
nvars <- length(vars)
nquad <- .make_nquad(nquad, vars)
## set up quadrature nodes and weights
# identify node type
# node.type == 0 is data.frame, so nodes assumed
# node.type == 1 is list, so nodes produced
if (is.data.frame(nodes)) {
node.type <- 0
# nothing else needs to be done to nodes
} else {
node.type <- 1
}
w0 <- weights
if (!is.data.frame(nodes)) {
approx <- .check.approx(approx)
nodes <- lapply(vars, function(x) nodes[[x]])
nodes <- lapply(seq_len(nvars), function(i) .make_nodes(nodes[[i]], data[,vars[i]], nquad[i], approx, trace))
names(nodes) <- vars
if (trace %in% 2:3) {
cat("\nQuadrature nodes and weights:\n")
lapply(seq_along(nodes), function(i) .print.nodes(nodes[[i]], names(nodes)[i]))
}
weights <- lapply(nodes, function(x) x[,2])
weights <- Reduce("*", expand.grid(weights))
if (length(w0) > 1)
stop("'weights' of length > 1 not compatible with `nodes' supplied as list. See Details for `ppgam'.")
weights <- w0 * weights
nodes <- lapply(nodes, function(x) x[,1])
nodes <- expand.grid(nodes)
n.nodes <- nrow(nodes)
} else {
n.nodes <- nrow(nodes)
if (length(w0) > 1 & length(w0) < nrow(nodes)) {
stop("length `weights' must be 1 or nrow(nodes)")
} else {
if (length(w0) == 1) {
weights <- rep(w0, n.nodes)
}
}
}
if (use.data) {
G <- mgcv::gam(formula, data=data, knots=knots, fit=FALSE, method="REML", family=poisson)
} else {
G <- mgcv::gam(formula, data=nodes, knots=knots, fit=FALSE, method="REML", family=poisson)
}
class(G) <- c("gam", "glm", "lm")
G$nb <- ncol(G$X)
G$coefficients <- numeric(G$nb)
G$wts <- weights
# now sort design matrices
if (use.data) {
G$XT <- predict(G, newdata=nodes, type="lpmatrix")
} else {
G$XT <- G$X
G$X <- predict(G, newdata=data, type="lpmatrix")
}
G$control <- .control.ppgam()
beta <- .give_beta0(G)
rho0 <- numeric(length(G$sp))
G$Sd <- evgam:::.joinSmooth(list(G))
G$S <- evgam:::.makeS(G$Sd, rho0)
G$null.deviance <- -.f(beta, G)
attr(rho0, "beta") <- beta
fit.reml <- evgam:::.BFGS(rho0, .reml0, .reml1, dat=G, control=G$control$outer, trace=trace %in% c(1, 3))
G$coefficients <- attr(fit.reml$objective, "beta")
H <- attr(fit.reml$objective, "Hessian")
G$Vp <- MASS::ginv(H)
G$S <- evgam:::.makeS(G$Sd, exp(fit.reml$par))
G$deviance <- -.f(G$coefficients, G)
H0 <- H - G$S
G$edf <- colSums(G$Vp %*% H0)
G$logLik <- -.f0(G$coefficients, G)
G$scale.estimated <- 0
G$df.residual <- ncol(G$X) - sum(G$edf)
G$aic <- 2 * (G$df.residual - G$logLik)
G$nobs <- nrow(data)
old <- c("mf", "pP", "cl")
new <- c("model", "paraPen", "call")
is.in <- !is.na(match(old, names(G)))
if (any(is.in)) names(G)[match(old[is.in], names(G))] <- new[is.in]
G$na.action <- attr(G$mf, "na.action")
G$sig2 <- G$scale.estimated <- 1
G$method <- "REML"
if (!with.response) G$formula <- formula0
G$family$no.r.sq <- TRUE
G$gcv.ubre <- as.vector(fit.reml$objective)
set.seed(1)
n.samp <- max(1e3, 2 * G$nb)
if (n.samp < nrow(G$X)) {
G$R <- G$X[sample(nrow(G$X), n.samp, replace=FALSE),]
} else {
G$R <- G$X
}
return(G)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.