R/acf.R
In itsadug: Interpreting Time Series and Autocorrelated Data Using GAMMs

Documented in acf_n_plots acf_plot acf_resid derive_timeseries resid_gam start_event start_value_rho

#' Generate N ACF plots of individual or aggregated time series.
#' 
#' @export
#' @import mgcv
#' @import stats
#' @import graphics
#' @param x A vector with time series data, typically residuals of a 
#' regression model.
#' @param n The number of plots to generate.
#' @param split_by List of vectors (each with equal length as \code{x}) that 
#' group the values of \code{x} into trials or timeseries events. 
#' Generally other columns from the same data frame.
#' @param cond Named list with a selection of the time series events
#' specified in \code{split_by}. Default is NULL, indicating that 
#' all time series are being processed, rather than a selection.
#' @param max_lag Maximum lag at which to calculate the acf. 
#' Default is the maximum for the longest time series.
#' @param fun The function used when aggregating over time series 
#' (depending on the value of \code{split_by}).
#' @param random Logical: determine randomly which \code{n} (aggregated) time 
#' series are plotted, or use the \code{\link{quantile}} function to find a 
#' range of different time series to plot. Default is FALSE (not random).
#' @param mfrow A vector of the form c(nr, nc). The figures will be drawn in 
#' an nr-by-nc array on the device by rows.
#' @param add Logical: whether to add the plots to an exiting plot window or 
#' not. 
#' Default is FALSE.
#' @param plot Logical: whether or not to produce plot. Default is TRUE.
#' @param print.summary Logical: whether or not to print summary.
#' Default set to the print info messages option 
#' (see \code{\link{infoMessages}}).
#' @param ... Other arguments for plotting, see \code{\link[graphics]{par}}.
#' @return \code{n} ACF plots providing information about the autocorrelation 
#' in \code{x}.
#' @author Jacolien van Rij, R. Harald Baayen
#' @seealso Use \code{\link[stats]{acf}} for the original ACF function, 
#' and \code{\link{acf_plot}} for an ACF that takes into account time series 
#' in the data.
#' @examples
#' data(simdat)
#' 
#' # Separate ACF for each time series:
#' acf_n_plots(simdat$Y, split_by=list(simdat$Subject, simdat$Trial))
#' 
#' # Average ACF per participant:
#' acf_n_plots(simdat$Y, split_by=list(simdat$Subject))
#' 
#' \dontrun{
#' # Data treated as single time series. Plot is added to current window.
#' # Note: 1 time series results in 1 plot.
#' acf_n_plots(simdat$Y, add=TRUE)
#' # Plot 4 ACF plots doesn't work without splitting data:
#' acf_n_plots(simdat$Y, add=TRUE, n=4)
#' 
#' # Plot ACFs of 4 randomly selected time series:
#' acf_n_plots(simdat$Y, random=TRUE, n=4, add=TRUE, 
#'     split_by=list(simdat$Subject, simdat$Trial))
#'
#' }
#'
#' #---------------------------------------------
#' # When using model residuals
#' #---------------------------------------------
#' 
#' \dontrun{
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' # simple linear model:
#' m1 <- lm(Y ~ Time, data=simdat)
#'
#' # This will generate an error:
#' # acf_n_plots(resid(m1), split_by=list(simdat$Subject, simdat$Trial))
#'
#' # This should work:
#' el.na <- missing_est(m1)
#' acf_n_plots(resid(m1), 
#'      split_by=list(simdat[-el.na,]$Subject, simdat[-el.na,]$Trial))
#'
#' # This should also work:
#' simdat$res <- NA
#' simdat[!is.na(simdat$Y),]$res <- resid(m1)
#' acf_n_plots(simdat$res, split_by=list(simdat$Subject, simdat$Trial))
#' }
#' 
#' # see the vignette for examples:
#' vignette('acf', package='itsadug')
#' @family functions for model criticism
acf_n_plots <- function(x, n = 5, split_by = NULL, cond = NULL, max_lag = NULL, fun = mean, plot = TRUE, random = F, 
    mfrow = NULL, add = FALSE, print.summary = getOption("itsadug_print"), ...) {
    
    # get acf data:
    suppressWarnings(acfdat <- acf_plot(x, split_by = split_by, cond = cond, max_lag = max_lag, fun = fun, 
        plot = FALSE, return_all = TRUE))
    if (!nrow(acfdat$acftable) >= n) {
        warning(sprintf("Number of time series in the data (%d) is smaller than n (%d). N is reduced to %d.\n", 
            nrow(acfdat$acftable), n, nrow(acfdat$acftable)))
        n <- nrow(acfdat$acftable)
    }
    # get lag 1A vector:
    lag1.all <- acfdat$acftable$"1"
    rn <- rownames(acfdat$acftable)
    lag1 <- lag1.all[!is.na(lag1.all)]
    nevents <- acfdat$n
    findNum <- rep(1, n)
    out <- list()
    
    if (random) {
        findNum <- sample(nrow(acfdat$acftable), size = n)
    } else {
        q <- quantile(lag1, probs = seq(0, 1, length = n))
        
        findClosestElement <- function(el, vals, num = TRUE) {
            y <- abs(vals - el)
            y <- min(y)
            if (num) {
                return(which(abs(vals - el) == y)[1])
            } else {
                return(vals[which(abs(vals - el) == y)[1]])
            }
        }
        
        findNum <- c()
        for (i in 1:n) {
            findNum <- c(findNum, findClosestElement(q[i], lag1))
            if (i > 1) {
                out[[i - 1]] <- list(quantile = c(q[i - 1], q[i]), elements = data.frame(event = rn[which(lag1.all >= 
                  q[i - 1] & lag1.all < q[i])], lag1 = lag1.all[which(lag1.all >= q[i - 1] & lag1.all < q[i])], 
                  stringsAsFactors = FALSE))
            }
        }
        out[["quantiles"]] <- q
        if (print.summary) {
            cat("Quantiles to be plotted:\n")
            print(q)
        }
    }
    
    if (plot) {
        cols <- ceiling(sqrt(n))
        rows <- ceiling(n/cols)
        
        if (!is.null(mfrow)) {
            cols = mfrow[2]
            rows = mfrow[1]
        }
        
        if (add == FALSE) {
            # dev.new(width = cols * 3, height = rows * 3)
            par(mfrow = c(rows, cols))
            oldmar <- par(mar = rep(3, 4))
        }
        parlist = list(...)
        if ("type" %in% names(parlist)) {
            parlist[["type"]] <- NULL
        }
        xlab <- "Lag"
        ylab <- "ACF"
        ylim <- range(acfdat$acftable[findNum, ], na.rm = T)
        main.new <- NULL
        if ("xlab" %in% names(parlist)) {
            xlab <- parlist[["xlab"]]
            parlist[["xlab"]] <- NULL
        }
        if ("ylab" %in% names(parlist)) {
            ylab <- parlist[["ylab"]]
            parlist[["ylab"]] <- NULL
        }
        if ("ylim" %in% names(parlist)) {
            ylim <- parlist[["ylim"]]
            parlist[["ylim"]] <- NULL
        }
        if ("main" %in% names(parlist)) {
            main.new <- parlist[["main"]]
            parlist[["main"]] <- NULL
        }
        other <- paste(sprintf("%s=%s", names(parlist), parlist), collapse = ",")
        
        for (j in 1:min(n, nrow(acfdat$acftable))) {
            main <- NULL
            if (is.null(main.new)) {
                main <- paste("ACF of", row.names(acfdat$acftable[findNum[j], ]))
            } else {
                if (length(main.new) >= j) {
                  main <- main.new[j]
                } else {
                  main <- main.new[1]
                }
            }
            eval(parse(text = sprintf("plot(as.numeric(colnames(acfdat$acftable)), acfdat$acftable[findNum[j], ], type = 'h', 
                main=main, xlab=xlab, ylab=ylab, ylim = ylim, %s)", 
                other)))
            ci <- -(1/nevents[findNum[j], "n"]) + 2/sqrt(nevents[findNum[j], "n"])
            abline(h = c(-1, 1) * ci, lty = 2, col = "blue")
            abline(h = 0)
        }
    }
    invisible(out)
}





#' Generate an ACF plot of an aggregated time series.
#' 
#' @export
#' @import mgcv
#' @import stats
#' @import graphics
#' @param x A vector with time series data, typically residuals of a 
#' regression model. 
#' (See examples for how to avoid errors due to missing values.)
#' @param split_by List of vectors (each with equal length as \code{x}) that 
#' group the values of \code{x} into trials or timeseries events. 
#' Generally other columns from the same data frame.
#' @param max_lag Maximum lag at which to calculate the acf. 
#' Default is the maximum for the longest time series.
#' @param cond Named list with a selection of the time series events
#' specified in \code{split_by}. Default is NULL, indicating that 
#' all time series are being processed, rather than a selection.
#' @param plot Logical: whether or not to plot the ACF. Default is TRUE.
#' @param fun The function used when aggregating over time series 
#' (depending on the value of \code{split_by}).
#' @param return_all Returning acfs for all time series.
#' @param ... Other arguments for plotting, see \code{\link[graphics]{par}}.
#' @return An aggregated ACF plot and / or optionally a list with the aggregated ACF values.
#' @author Jacolien van Rij
#' @seealso Use \code{\link[stats]{acf}} for the original ACF function, 
#' and \code{\link{acf_n_plots}} for inspection of individual time series.
#' @examples
#' data(simdat)
#'
#' # Default acf function:
#' acf(simdat$Y)
#' # Same plot with acf_plot:
#' acf_plot(simdat$Y)
#' # Average of ACFs per time series:
#' acf_plot(simdat$Y, split_by=list(simdat$Subject, simdat$Trial))
#' # Median of ACFs per time series:
#' acf_plot(simdat$Y, split_by=list(simdat$Subject, simdat$Trial), fun=median)
#'
#' # extract value of Lag1:
#' lag1 <- acf_plot(simdat$Y, 
#'    split_by=list(Subject=simdat$Subject, Trial=simdat$Trial), 
#'    plot=FALSE)['1']
#'
#' #---------------------------------------------
#' # When using model residuals
#' #---------------------------------------------
#' 
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' # simple linear model:
#' m1 <- lm(Y ~ Time, data=simdat)
#'
#' \dontrun{
#' # This will generate an error:
#' acf_plot(resid(m1), split_by=list(simdat$Subject, simdat$Trial))
#' }
#' # This should work:
#' el.na <- missing_est(m1)
#' acf_plot(resid(m1), 
#'      split_by=list(simdat[-el.na,]$Subject, simdat[-el.na,]$Trial))
#'
#' # This should also work:
#' simdat$res <- NA
#' simdat[!is.na(simdat$Y),]$res <- resid(m1)
#' acf_plot(simdat$res, split_by=list(simdat$Subject, simdat$Trial))
#'
#' # see the vignette for examples:
#' vignette('acf', package='itsadug')
#'
#' @family functions for model criticism
acf_plot <- function(x, split_by = NULL, max_lag = NULL, plot = TRUE, fun = mean, cond = NULL, return_all = FALSE, 
    ...) {
    
    # check x:
    if (!is.vector(x)) {
        if (dim(x)[1] == 1) {
            x <- as.vector(x[1, ])
        } else if (dim(x)[2] == 1) {
            x <- as.vector(x[, 1])
        } else {
            stop(sprintf("%s is not a vector (dim: %d x %d).\n", deparse(substitute(x)), dim(x)[1], dim(x)[2]))
        }
    }
    xname <- deparse(substitute(x))
    plotci <- FALSE
    
    # check length x
    if (length(x) > 1) {
        # find missing values:
        n <- which(is.na(x))
        
        # check split factors:
        if (!is.null(split_by)) {
            for (i in 1:length(split_by)) {
                if (length(split_by[[i]]) != length(x)) {
                  name_split_i <- ifelse(!is.null(names(split_by)[i]), names(split_by)[i], sprintf("split_by[[%d]]", 
                    i))
                  errormessage <- sprintf("Split factor %s is not of same length as %s: %s has %d elements, %s has %d elements.\n
                    See help(acf_plot) for examples how to avoid this error.", 
                    name_split_i, deparse(substitute(x)), deparse(substitute(x)), length(x), name_split_i, 
                    length(split_by[[i]]))
                  stop(errormessage)
                }
            }
        } else {
            # warning(sprintf('No argument for split provided. %s is treated as a single time series.\n',
            # deparse(substitute(x))))
            split_by <- as.factor(rep("Factor", length(x)))
            plotci <- TRUE
        }
        # check condition:
        if (!is.null(cond)) {
            if (!is.null(split_by)) {
                el <- 1:length(split_by[[1]])
                for (i in names(cond)) {
                  if (i %in% names(split_by)) {
                    el <- intersect(el, which(split_by[[i]] %in% cond[[i]]))
                  } else {
                    warning(sprintf("Predictor %s not specified in split_by. cond will be ignored.", i))
                  }
                }
                if (length(el) > 0) {
                  x <- x[el]
                  for (i in names(split_by)) {
                    if (is.factor(split_by[[i]])) {
                      split_by[[i]] <- droplevels(split_by[[i]][el])
                    } else {
                      split_by[[i]] <- split_by[[i]][el]
                    }
                  }
                } else {
                  warning("Specified conditions not found in values of split_by. cond will be ignored.")
                }
                
            } else {
                warning("Split_by is empty, therefore cond will be ignored. Specify time series in cond for selecting specific time series.")
            }
        }
        
        # split x, and calculate average acf:
        splitdat <- split(x, f = split_by, drop = T)
        acfn <- lapply(splitdat, FUN = function(x) {
            x.rmna <- x[!is.na(x)]
            return(length(x.rmna))
        })
        splitacf <- lapply(splitdat, FUN = function(x) {
            x.rmna <- x[!is.na(x)]
            return(acf(x.rmna, plot = F)$acf)
        })
        len <- max_lag
        if (is.null(len)) {
            len <- max(unlist(lapply(splitacf, FUN = length)), na.rm = T)
        }
        splitacf <- lapply(splitacf, FUN = function(x, max = len) {
            if (length(x) < max) {
                return(c(x, rep(NA, max - length(x))))
            } else if (length(x) > max) {
                return(x[1:max])
            } else {
                return(x)
            }
        })
        # create wide format dfr:
        allacf <- as.data.frame(do.call("rbind", splitacf), stringsAsFactors = FALSE)
        names(allacf) <- (1:ncol(allacf)) - 1
        avgacf <- apply(allacf, 2, FUN = fun)  #apply(allacf, 2, FUN=fun, na.rm=T)
        
        if (plot) {
            # set plot arguments
            plot_default <- list(main = sprintf("ACF of %s", xname), xlab = "Lag", ylab = ifelse(plotci == 
                TRUE, "ACF function (per time series)", "ACF"), ylim = c(min(min(avgacf, na.rm = TRUE), 0), 
                max(max(avgacf, na.rm = TRUE), 1)), col = "black", type = "h")
            
            plot_args <- list(...)
            
            # merge plot info
            plot_args_def <- c()
            for (pa in names(plot_default)[!names(plot_default) %in% names(plot_args)]) {
                value <- plot_default[[pa]]
                if (length(value) > 1) {
                  value <- sprintf("c(%s)", paste(value, collapse = ","))
                  plot_args_def <- c(plot_args_def, paste(pa, value, sep = "="))
                } else {
                  plot_args_def <- c(plot_args_def, paste(pa, ifelse(is.character(value), sprintf("'%s'", 
                    value), value), sep = "="))
                }
            }
            
            if (length(plot_args_def) > 0) {
                eval(parse(text = paste("plot(0:(len-1), avgacf, ", paste(plot_args_def, collapse = ","), 
                  ", ...)")))
            } else {
                plot(0:(len - 1), avgacf, ...)
            }
            if (plotci) {
                ci <- -(1/acfn[[1]]) + 2/sqrt(acfn[[1]])
                abline(h = c(-1, 1) * ci, lty = 2, col = "blue")
            }
            # }else{ tmpn <- length(allacf[!is.na(allacf)]) ci <- -(1/tmpn)+2/sqrt(tmpn) abline(h=c(-1,1)*ci, lty=2,
            # col='blue') }
            abline(h = 0)
        }
        
        # set output:
        acf_out <- avgacf
        if (return_all) {
            # create long format dfr:
            dfracf <- do.call("rbind", mapply(function(x, y, z) {
                data.frame(acf = x, lag = 0:(length(x) - 1), n = rep(y, length(x)), event = rep(z, length(x)), 
                  stringsAsFactors = FALSE)
            }, splitacf, acfn, names(splitacf), SIMPLIFY = FALSE, USE.NAMES = FALSE))
            dfracf$ci <- -(1/dfracf$n) + 2/sqrt(dfracf$n)
            # add event info:
            events <- as.data.frame(split_by, stringsAsFactors = FALSE)
            events$event <- apply(events, 1, function(x) {
                gsub(" ", "", paste(x, collapse = "."), fixed = TRUE)
            })
            events <- events[!duplicated(events), ]
            dfracf <- merge(dfracf, events, by = "event", all.x = TRUE, all.y = FALSE)
            acfn <- do.call("rbind", lapply(names(acfn), function(x) {
                data.frame(n = acfn[[x]], event = x, stringsAsFactors = FALSE)
            }))
            
            acf_out <- list(acf = avgacf, acftable = allacf, dataframe = dfracf, n = acfn, series = deparse(substitute(x)), 
                FUN = fun)
            
        }
        invisible(acf_out)
    } else {
        stop(sprintf("Not sufficient data to plot ACF: %s has %d elements.\n", deparse(substitute(x)), length(x)))
    }
}





#' Generate an ACF plot of model residuals. Works for lm, lmer, gam, bam, ....
#' 
#' @description Wrapper around \code{\link{acf_plot}} and 
#' \code{\link{acf_n_plots}} for regression models.
#' @export
#' @import mgcv
#' @import stats
#' @param model A regression model generated by \code{lm}, \code{glm},
#' \code{lmer}, \code{glmer}, \code{\link[mgcv]{gam}}, 
#' or \code{\link[mgcv]{bam}}.
#' (See examples for how to avoid errors due to missing values.)
#' @param split_pred Vector with names of model predictors that determine
#' the time series in the data, or should be used to split the ACF plot by.
#' Alternatively, \code{split_pred} can be a named list as being used by 
#' \code{\link{acf_plot}} and \code{\link{acf_n_plots}}.
#' Yet another option is to provide the text string 'AR.start', for a model 
#' that includes an AR1 model. The events are derived from the AR.start column 
#' if that is provided.
#' @param n The number of plots to generate. If \code{n}=1 (default) then 
#' \code{\link{acf_plot}} is being called. If \code{n}>1 then 
#' \code{\link{acf_n_plots}} is being called.
#' @param plot Logical: whether or not to produce plot. Default is TRUE.
#' @param check.rho Numeric value: Generally leave at NULL. This value does 
#' not change anything, but it is used to check whether the model's AR1 
#' coefficient matches the expected value of rho. 
#' @param main Text string, title of plot.
#' @param ... Other arguments as input for \code{\link{acf_plot}} 
#' or \code{\link{acf_n_plots}}.
#' @return An aggregated ACF plot and / or optionally a list with the aggregated ACF values.
#' @author Jacolien van Rij
#' @seealso Use \code{\link[stats]{acf}} for the original ACF function, 
#' and \code{\link{acf_plot}}, or \code{\link{acf_n_plots}}.
#' @examples
#' data(simdat)
#'
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' 
#' \dontrun{
#' # Run GAMM model:
#' m1 <- bam(Y ~ te(Time, Trial)+s(Subject, bs='re'), data=simdat)
#'
#' # Using a list to split the data:
#' acf_resid(m1, split_pred=list(simdat$Subject, simdat$Trial))
#' # ...or using model predictors:
#' acf_resid(m1, split_pred=c('Subject', 'Trial'))
#' 
#' # Calling acf_n_plots:
#' acf_resid(m1, split_pred=c('Subject', 'Trial'), n=4)
#' # add some arguments:
#' acf_resid(m1, split_pred=c('Subject', 'Trial'), n=4, max_lag=10)
#' 
#' # This does not work...
#' m2 <- lm(Y ~ Time, data=simdat)
#' acf_resid(m2, split_pred=c('Subject', 'Trial'))
#' # ... but this is ok:
#' acf_resid(m2, split_pred=list(simdat$Subject, simdat$Trial))
#' 
#' # Using AR.start column:
#' simdat <- start_event(simdat, event=c('Subject', 'Trial'))
#' r1 <- start_value_rho(m1)
#' m3 <- bam(Y ~ te(Time, Trial)+s(Subject, bs='re'), data=simdat, 
#'     rho=r1, AR.start=simdat$start.event)
#' acf_resid(m3, split_pred='AR.start')
#' # this is the same:
#' acf_resid(m3, split_pred=c('Subject', 'Trial'))
#' # Note: use model comparison to find better value for rho
#' }
#' # see the vignette for examples:
#' vignette('acf', package='itsadug')
#' @family functions for model criticism
acf_resid <- function(model, split_pred = NULL, n = 1, plot = TRUE, check.rho = NULL, main = NULL, ...) {
    split_by = NULL
    res <- NULL
    if (!is.null(check.rho)) {
        if ("AR1.rho" %in% names(model)) {
            mess <- "AR.start is not specified."
            if ("(AR.start)" %in% names(model$model)) {
                mess <- ""
            }
            if (model$AR1.rho != check.rho) {
                message(sprintf("The model's value of rho %f does not match the expected value %f. %s", model$AR1.rho, 
                  check.rho, mess))
            } else {
                message(sprintf("The model's value of rho matches the expected value %f. %s", check.rho, mess))
            }
        } else {
            if (!inherits(model, "bam")) {
                message("No value for rho specified in model. Use function bam() to specify rho.")
            } else if ("(AR.start)" %in% names(model$model)) {
                message("AR.start is specified, but no value set for rho.")
            } else {
                message("Argument rho and AR.start not set in this model.")
            }
        }
    }
    if (!is.null(split_pred)) {
        
        split_by = list()
        if (!is.list(split_pred)) {
            # extract time series data from model:
            dat <- NULL
            if ("lm" %in% class(model)) {
                dat <- model$model
            } else if ("lmerMod" %in% class(model)) {
                dat <- model@frame
            }
            if (length(split_pred) == 1 & split_pred[1] %in% c("AR.start", "start.event")) {
                if (split_pred %in% colnames(dat)) {
                  if (is.logical(dat[, split_pred])) {
                    split_by[[split_pred]] = derive_timeseries(dat[, split_pred])
                  } else {
                    split_by[[split_pred]] = dat[, split_pred]
                  }
                } else {
                  split_by[[split_pred]] = derive_timeseries(model)
                }
            } else if (!all(split_pred %in% colnames(dat))) {
                notindata <- paste(split_pred[!split_pred %in% colnames(dat)], collapse = ", ")
                stop(sprintf("split-pred value(s) %s is / are not included as predictor in the model.", notindata))
            } else {
                for (i in split_pred) {
                  split_by[[i]] <- as.vector(dat[, i])
                }
            }
        } else {
            split_by <- split_pred
            # check for missing data
            me <- missing_est(model)
            if (!is.null(me)) {
                split_by <- lapply(split_pred, function(x) {
                  return(x[-me])
                })
            }
        }
    }
    if (n > 1) {
        if (!all(is.na(resid_gam(model, incl_na = TRUE)))) {
            if (is.null(split_pred)) {
                out <- acf_n_plots(resid_gam(model), split_by = split_by, n = n, plot = plot, ...)
                invisible(out)
                # }else if( !is.list(split_pred)){
            } else if (("(AR.start)" %in% colnames(model$model)) || (("AR1.rho" %in% names(model)) && model$AR1.rho > 
                0)) {
                out <- acf_n_plots(resid_gam(model, incl_na = TRUE), split_by = split_by, n = n, plot = plot, 
                  ...)
                invisible(out)
            } else {
                out <- acf_n_plots(resid_gam(model), split_by = split_by, n = n, plot = plot, ...)
                invisible(out)
            }
        }
    } else {
        if (!all(is.na(resid_gam(model, incl_na = TRUE)))) {
            if (is.null(main)) {
                main = ifelse(is.null(split_pred), sprintf("ACF resid_gam(%s)", deparse(substitute(model))), 
                  "ACF Average")
            }
            if (is.null(split_pred)) {
                out <- acf_plot(resid_gam(model), split_by = split_by, plot = plot, main = main, ...)
                invisible(out)
                # }else if( !is.list(split_pred)){
            } else if (("(AR.start)" %in% colnames(model$model)) || (("AR1.rho" %in% names(model)) && model$AR1.rho > 
                0)) {
                out <- acf_plot(resid_gam(model, incl_na = TRUE), split_by = split_by, plot = plot, main = main, 
                  ...)
                invisible(out)
            } else {
                out <- acf_plot(resid_gam(model), split_by = split_by, plot = plot, main = main, ...)
                invisible(out)
            }
        }
    }
}





#' Derive the time series used in the AR1 model.
#' 
#' @export
#' @import mgcv
#' @import stats
#' @import plotfunctions
#' @param model GAMM model that includes an AR1 model.
#' @param AR.start Vector with AR.start information, 
#' necessary for the AR1 model. Optional, defaults to NULL.
#' @return A vector with time series indication based on the AR1 model.
#' @author Jacolien van Rij
#' @examples
#' data(simdat)
#' 
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' simdat <- start_event(simdat, event=c('Subject', 'Trial'))
#' 
#' \dontrun{
#' # Run GAMM model:
#' m1 <- bam(Y ~ te(Time, Trial)+s(Subject, bs='re'), data=simdat, 
#'     rho=.5, AR.start=simdat$start.event)
#' simdat$Event <- NA
#' simdat[!is.na(simdat$Y),]$Event <- derive_timeseries(m1)
#' acf_resid(m1, split_pred=list(Event=simdat$Event))
#' 
#' # And this works too:
#' simdat$Event <- derive_timeseries(simdat$start.event)
#' acf_resid(m1, split_pred=list(Event=simdat$Event))
#' 
#' # Note that acf_resid automatically makes use of derive_timeseries:
#' acf_resid(m1, split_pred='AR.start')
#' }
#' @family functions for model criticism
derive_timeseries <- function(model, AR.start = NULL) {
    tmpdat <- c()
    na.el <- c()
    if (inherits(model, "lm")) {
        tmpdat <- model$model
        na.el <- missing_est(model)
    } else if (inherits(model, "lmerMod")) {
        tmpdat <- model@frame
        na.el <- missing_est(model)
    } else if (inherits(model, "logical")) {
        tmpdat <- data.frame(val = model, stringsAsFactors = FALSE)
        names(tmpdat) <- "(AR.start)"
        na.el <- which(is.na(model))
    }
    if (!"(AR.start)" %in% names(tmpdat)) {
        if (is.null(AR.start)) {
            stop("Model does not contain an AR1 model. Please specify AR.start information.")
        } else {
            if (length(AR.start) == nrow(tmpdat)) {
                tmpdat[, "(AR.start)"] <- AR.start
            } else if (length(AR.start) == (nrow(tmpdat) + length(na.el))) {
                tmpdat[, "(AR.start)"] <- AR.start[-na.el]
            } else {
                stop(sprintf("AR.start has more values (%f) than observations used in the model (%f).", length(AR.start), 
                  nrow(tmpdat)))
            }
        }
    } else {
        if (!is.null(AR.start)) {
            warning(sprintf("Model %s includes AR1 model, AR.start will be ignored.", deparse(substitute(model))))
        }
    }
    starts <- which(tmpdat[, "(AR.start)"] == TRUE)
    ends <- move_n_point(starts - 1, n = -1)
    ends[length(ends)] <- nrow(tmpdat)
    return(as.factor(rep(1:length(starts), ends - starts + 1)))
}





#' Extract model residuals and remove the autocorrelation accounted for. 
#' 
#' @export
#' @import mgcv
#' @import stats
#' @aliases resid.gam
#' @param model A GAMM model build with \code{\link[mgcv]{gam}} or 
#' \code{\link[mgcv]{bam}}.
#' @param AR_start Optional: vector with logicals, indicating the start of 
#' events. 
#' Default is NULL, because generally the function can retrieve all necessary 
#' information from the model.
#' @param incl_na Whether or not to include missing values (NA)when returning 
#' the residuals. Defaults to FALSE.
#' @param return_all Default is FALSE. Returns a list with normal residuals, 
#' corrected residuals, and the value of rho.
#' @return Corrected residuals.
#' @author Jacolien van Rij
#' @examples
#' data(simdat)
#' 
#' \dontrun{
#' # Add start event column:
#' simdat <- start_event(simdat, event=c('Subject', 'Trial'))
#' head(simdat)
#' # bam model with AR1 model (toy example, not serious model):
#' m1 <- bam(Y ~ Group + te(Time, Trial, by=Group), 
#'    data=simdat, rho=.5, AR.start=simdat$start.event)
#' # Standard residuals:
#' res1 <- resid(m1)
#' # Corrected residuals:
#' res2 <- resid_gam(m1)
#' 
#' # Result in different ACF's:
#' par(mfrow=c(1,2))
#' acf(res1)
#' acf(res2)
#'
#' # Without AR.start included in the model:
#' m2 <- bam(Y ~ Group + te(Time, Trial, by=Group), 
#'    data=simdat)
#' acf(resid_gam(m2), plot=F)
#' # Same as resid(m2)!
#' acf(resid(m2), plot=F)
#'
#' ### MISSING VALUES ###
#' # Note that corrected residuals cannot be calculated for the last 
#' # point of each time series. These missing values are by default
#' # excluded.
#'
#' # Therefore, this will result in an error...
#' simdat$res <- resid_gam(m1)
#' # ... and this will give an error too:
#' simdat$res <- NA
#' simdat[!is.na(simdat$Y),] <- resid_gam(m1)
#' # ... but this works:
#' simdat$res <- resid_gam(m1, incl_na=TRUE)
#' 
#' # The parameter incl_na will NOT add missing values
#' # for missing values in the *data*. 
#' # Example:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' # Without AR.start included in the model:
#' m2 <- bam(Y ~ Group + te(Time, Trial, by=Group), 
#'    data=simdat)
#' # This works:
#' acf(resid_gam(m2))
#' # ...but this results in error, although no AR1 model specified:
#' simdat$res <- resid_gam(m2)
#' # ... for this type of missing data, this does not solve the problem:
#' simdat$res <- resid_gam(m2, incl_na=TRUE)
#' # instead try this:
#' simdat$res <- NA
#' simdat[-missing_est(m2),]$res <- resid_gam(m2)
#' 
#' # With AR.start included in the model:
#' m1 <- bam(Y ~ Group + te(Time, Trial, by=Group), 
#'    data=simdat, rho=.5, AR.start=simdat$start.event)
#' # This works (incl_na=TRUE):
#' simdat$res <- NA
#' simdat[-missing_est(m2),]$res <- resid_gam(m2, incl_na=TRUE)
#'
#' }
#' @seealso \code{\link[stats]{resid}}, \code{\link{missing_est}}
#' @family functions for model criticism
resid_gam <- function(model, AR_start = NULL, incl_na = FALSE, return_all = FALSE) {
    
    # help function
    next_point <- function(x) {
        x <- as.vector(x)
        
        if (length(x) > 1) 
            return(c(x[2:length(x)], NA)) else if (length(x) == 1) 
            return(NA) else return(NULL)
    }
    
    # extract time series data from model:
    tmpdat <- NULL
    n <- 1
    rho <- 0
    if ("lm" %in% class(model)) {
        tmpdat <- model$model
        # check AR_start:
        if (is.null(tmpdat$"(AR.start)")) {
            tmpdat$"(AR.start)" <- rep(FALSE, nrow(tmpdat))
            tmpdat[1, ]$"(AR.start)" <- TRUE
            
            if (!is.null(model$AR1.rho)) {
                if (model$AR1.rho > 0) {
                  if (is.null(AR_start)) {
                    warning("No values for AR_start found in model specification. Please provide values for AR_start as argument to this function if you have run the model with an older version of mgcv (< 1.7.28).")
                  } else {
                    warning(sprintf("Values for argument AR_start may not be specified, although an AR1 model rho was included in model %s (rho = %f).", 
                      deparse(substitute(model)), model$AR1.rho))
                    tmpdat$"(AR.start)" <- AR_start
                  }
                }
            }
        }
        
        # additional check of AR.start:
        if ((!TRUE %in% unique(tmpdat$"(AR.start)")) | (!FALSE %in% unique(tmpdat$"(AR.start)"))) {
            stop("No event starts found in AR_start argument.\nImportant: check the values of the AR_start argument, add at least one event start (value TRUE), and run the model again.")
        }
        
        n <- which(tmpdat$"(AR.start)")
        if (is.null(model$AR1.rho)) {
            # warning('No rho specified in model. Assumed rho to equal 0.')
            model[["AR1.rho"]] <- 0
            rho <- 0
        } else {
            rho <- model$AR1.rho
        }
    } else if ("lmerMod" %in% class(model)) {
        tmpdat <- model@frame
        rho <- 0
    } else {
        stop(sprintf("Function does not work for models of class %s.", class(model)[1]))
    }
    
    
    if (nrow(tmpdat) == length(resid(model))) {
        tmpdat$RES <- resid(model)
    } else {
        tmpdat$RES <- NA
        if (nrow(tmpdat) == length(resid(model))) {
            tmpdat$RES <- resid(model)
        } else {
            tmpdat[!(1:nrow(tmpdat)) %in% model$na.action, ]$RES <- resid(model)
        }
    }
    
    tmpdat$RES_next <- next_point(tmpdat$RES)
    if (length(n[(n - 1) > 0]) > 0) {
        tmpdat[n[(n - 1) > 0], ]$RES_next <- rep(NA, length(n[(n - 1) > 0]))
    }
    
    res <- tmpdat$RES_next - rho * tmpdat$RES
    if (return_all) {
        return(list(res = tmpdat$RES, norm_res = res, AR1_rho = rho))
    } else {
        if (rho == 0) {
            res <- tmpdat$RES
        }
        if (incl_na) {
            return(res)
        } else {
            return(res[!is.na(res)])
        }
    }
}





#' Determine the starting point for each time series.
#' 
#' @export
#' @import mgcv
#' @import stats
#' @param data A data frame.
#' @param column Test string, name of the column that describes the order 
#' withing the time series. Default is 'Time'.
#' @param event A text string or vector indicating the columns that define the 
#' unique time series. Default is 'Event'.
#' @param label The name of the new column with the start point of each time 
#' series. Default is 'start.event'.
#' @param label.event In case \code{event} is not a single column, providing a 
#' text string will add a column with this name that defines unique time 
#' series. Default is NULL (no new column for time series is created).
#' @param order Logical: whether or not to order each time series. 
#' Default is TRUE, maybe set to FALSE with large data frames that are already ordered.
#' @param newcode Logical: whether or not to use the new (and hopefully faster) code.
#' @section Note:
#' When working with large data frames, it may be worth installing the package 
#' \code{data.table}. Although not required for the package, the function 
#' \code{start_event} will check if \code{data.table} is available and will 
#' use it's much faster function \code{rbindlist}. This speeds up the function 
#' \code{start_event}. Run the command 
#' \code{install.packages('data.table', repos='http://cran.us.r-project.org')} 
#' in the command window for installing the package \code{data.table}.
#' @return Data frame.
#' @author Jacolien van Rij
#' @examples 
#' data(simdat)
#' head(simdat)
#' simdat <- simdat[sample(1:nrow(simdat)),]
#' simdat$Condition <- relevel(factor(simdat$Condition), ref="0")
#' contrasts(simdat$Condition) <- "contr.poly"
#' contrasts(simdat$Condition)
#' test <- start_event(simdat, event=c('Subject', 'Trial'), label.event='Event') 
#' contrasts(test$Condition)
#' head(test)
#' test <- start_event(simdat, event="Subject")
#' test <- start_event(simdat, event=c('Subject', 'Trial'))
#' @family functions for model criticism
start_event <- function(data, column = "Time", event = "Event", label = "start.event", label.event = NULL, 
    order = TRUE, newcode = TRUE) {
    if (is.null(label)) {
        stop("No output column specified in argument 'label'.")
    }
    if (!column %in% names(data)) {
        stop(sprintf("No column '%s' in data frame %s.", column, deparse(substitute(data))))
    }
    if (!all(event %in% names(data))) {
        el <- paste(which(event[!event %in% names(data)]), collapse = ",")
        stop(sprintf("Column name(s) '%s' not found in data frame %s.", el, deparse(substitute(data))))
    }
    if (length(column) > 1) {
        warning(sprintf("Argument column has %d elements. Only first element is used.", length(column)))
        column <- column[1]
    }
    if (label %in% names(data)) {
        warning(sprintf("Column %s already exists, will be overwritten.", label))
    }
    if (!is.null(label.event)) {
        if (label.event %in% names(data)) {
            warning(sprintf("Column %s already exists, will be overwritten.", label.event))
        }
    }
    

    if(newcode){
        if (!is.null(label.event) & length(event) > 1) {
            data[, label.event] <- interaction(data[, event], drop=TRUE)  
            if(order){
                data <- data[order(data[, label.event], data[, column]),]
            }
            check <- tapply(data[,column], list(data[, label.event]), 
                function(x){return(min(x, na.rm=TRUE))})   
            data[, label] <- data[, column] == check[data[,label.event]]     
        } else if (length(event) > 1) {
            dummy.event.label = paste(c("tmpcolumn_", sample(c(letters, as.character(0:9), "_", "."), 
                size=100, replace=TRUE)), collapse="")
            data[, dummy.event.label] <- interaction(data[, event], drop=TRUE)
            if(order){
                data <- data[order(data[, dummy.event.label], data[, column]),]
            }  
            check <- tapply(data[,column], list(data[, dummy.event.label]), 
                function(x){return(min(x, na.rm=TRUE))})   
            data[, label] <- data[, column] == check[data[,dummy.event.label]] 
            data[, dummy.event.label] <- NULL  
        } else {
            if(order){
                data <- data[order(data[, event], data[, column]),]
            }
            check <- tapply(data[,column], list(data[, event]), 
                function(x){return(min(x, na.rm=TRUE))})   
            data[, label] <- data[, column] == check[data[,event]] 
        }
        return(data)
    } else {
    ## OLD CODE:
        # store reference levels:
        ref <- refLevels(data)
        tmp <- NULL
        if (!is.null(label.event) & length(event) > 1) {
            data[, label.event] <- interaction(data[, event])
            tmp <- split(data, f = list(data[, label.event]), drop = TRUE)
        } else if (length(event) > 1) {
            tmp <- split(data, f = as.list(data[, event]), drop = TRUE)
        } else {
            tmp <- split(data, f = list(data[, event]), drop = TRUE)
        }
        if (order) {
            tmp <- lapply(tmp, function(x) {
                min.x <- min(x[, column])
                x[, label] <- x[, column] == min.x
                x <- x[order(x[, column]), ]
                return(x)
            })
        } else {
            tmp <- lapply(tmp, function(x) {
                min.x <- min(x[, column])
                x[, label] <- x[, column] == min.x
                return(x)
            })
        }
        if (requireNamespace("data.table", quietly = TRUE)) {
            tmp <- as.data.frame(data.table::rbindlist(tmp), stringsAsFactors = FALSE)
        } else {
            if(getOption("itsadug_print")){
                message("Package data.table is not installed. This package may speed up the function start_event.")
            }
            tmp <- do.call("rbind", tmp)
        }
        row.names(tmp) <- NULL
        # put same reference levels back:
        for(i in names(ref)){
            tmp[,i] <- relevel(as.factor(tmp[,i]), ref=ref[[i]][['ref']])
            contrasts(tmp[,i]) <- ref[[i]][['contr']]
        }
        return(tmp)
    }
}





#' Extract the Lag 1 value from the ACF of the residuals of a gam, bam, lm, 
#' lmer model, ...
#' 
#' @description Wrapper around \code{\link{acf_plot}} for regression models.
#' @export
#' @import mgcv
#' @import stats
#' @param model A regression model generated by \code{lm}, \code{glm},
#' \code{lmer}, \code{glmer}, \code{\link[mgcv]{gam}}, 
#' or \code{\link[mgcv]{bam}}.
#' (See examples for how to avoid errors due to missing values.)
#' @param plot Logical: whether or not to produce plot. Default is TRUE.
#' @param lag Numeric value, indicating the lag. Default is 2. 
#' @param main Text string, title of ACF plot.
#' @param ... Other arguments for plotting the acf, see 
#' \code{\link[stats]{acf}}.
#' @return The autocorrelation value of data points with the data points 
#' at lag \code{lag}.
#' @author Jacolien van Rij
#' @seealso Use \code{\link[stats]{acf}} for the original ACF function, 
#' and \code{\link{acf_plot}}, or \code{\link{acf_resid}}.
#' @examples
#' data(simdat)
#'
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' 
#' \dontrun{
#' # Run GAMM model:
#' m1 <- bam(Y ~ te(Time, Trial)+s(Subject, bs='re'), data=simdat)
#'
#' # No plotting:
#' start_value_rho(m1)
#' # With plot:
#' rhom1 <- start_value_rho(m1, plot=TRUE)
#' 
#' }
#' # see the vignette for examples:
#' vignette('acf', package='itsadug')
#' @family functions for model criticism
#' 
start_value_rho <- function(model, plot = FALSE, lag = 2, main = NULL, ...) {
    rval <- acf(resid(model), plot = plot, main = ifelse(is.null(main), sprintf("Series resid(%s)", deparse(substitute(model))), 
        main), ...)$acf[lag]
    if (plot) {
        par <- list(...)
        f <- par()$lwd
        if ("lwd" %in% names(par)) {
            f <- par[["lwd"]]
        }
        points(lag - 1, rval, col = "red", type = "h", lwd = 1.5 * f)
        abline(h = rval, col = "red", lty = 3)
    }
    return(rval)
}
Any scripts or data that you put into this service are public.
itsadug documentation built on June 17, 2022, 5:05 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
itsadug
Interpreting Time Series and Autocorrelated Data Using GAMMs

R/acf.R
In itsadug: Interpreting Time Series and Autocorrelated Data Using GAMMs

Defines functions start_value_rho start_event resid_gam derive_timeseries acf_resid acf_plot acf_n_plots

Documented in acf_n_plots acf_plot acf_resid derive_timeseries resid_gam start_event start_value_rho

Try the itsadug package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

itsadug Interpreting Time Series and Autocorrelated Data Using GAMMs

R/acf.R In itsadug: Interpreting Time Series and Autocorrelated Data Using GAMMs

Defines functions start_value_rho start_event resid_gam derive_timeseries acf_resid acf_plot acf_n_plots

Documented in acf_n_plots acf_plot acf_resid derive_timeseries resid_gam start_event start_value_rho

Try the itsadug package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

itsadug
Interpreting Time Series and Autocorrelated Data Using GAMMs

R/acf.R
In itsadug: Interpreting Time Series and Autocorrelated Data Using GAMMs