R/acf.R

Defines functions start_value_rho start_event resid_gam derive_timeseries acf_resid acf_plot acf_n_plots

Documented in acf_n_plots acf_plot acf_resid derive_timeseries resid_gam start_event start_value_rho

#' Generate N ACF plots of individual or aggregated time series.
#' 
#' @export
#' @import mgcv
#' @import stats
#' @import graphics
#' @param x A vector with time series data, typically residuals of a 
#' regression model.
#' @param n The number of plots to generate.
#' @param split_by List of vectors (each with equal length as \code{x}) that 
#' group the values of \code{x} into trials or timeseries events. 
#' Generally other columns from the same data frame.
#' @param cond Named list with a selection of the time series events
#' specified in \code{split_by}. Default is NULL, indicating that 
#' all time series are being processed, rather than a selection.
#' @param max_lag Maximum lag at which to calculate the acf. 
#' Default is the maximum for the longest time series.
#' @param fun The function used when aggregating over time series 
#' (depending on the value of \code{split_by}).
#' @param random Logical: determine randomly which \code{n} (aggregated) time 
#' series are plotted, or use the \code{\link{quantile}} function to find a 
#' range of different time series to plot. Default is FALSE (not random).
#' @param mfrow A vector of the form c(nr, nc). The figures will be drawn in 
#' an nr-by-nc array on the device by rows.
#' @param add Logical: whether to add the plots to an exiting plot window or 
#' not. 
#' Default is FALSE.
#' @param plot Logical: whether or not to produce plot. Default is TRUE.
#' @param print.summary Logical: whether or not to print summary.
#' Default set to the print info messages option 
#' (see \code{\link{infoMessages}}).
#' @param ... Other arguments for plotting, see \code{\link[graphics]{par}}.
#' @return \code{n} ACF plots providing information about the autocorrelation 
#' in \code{x}.
#' @author Jacolien van Rij, R. Harald Baayen
#' @seealso Use \code{\link[stats]{acf}} for the original ACF function, 
#' and \code{\link{acf_plot}} for an ACF that takes into account time series 
#' in the data.
#' @examples
#' data(simdat)
#' 
#' # Separate ACF for each time series:
#' acf_n_plots(simdat$Y, split_by=list(simdat$Subject, simdat$Trial))
#' 
#' # Average ACF per participant:
#' acf_n_plots(simdat$Y, split_by=list(simdat$Subject))
#' 
#' \dontrun{
#' # Data treated as single time series. Plot is added to current window.
#' # Note: 1 time series results in 1 plot.
#' acf_n_plots(simdat$Y, add=TRUE)
#' # Plot 4 ACF plots doesn't work without splitting data:
#' acf_n_plots(simdat$Y, add=TRUE, n=4)
#' 
#' # Plot ACFs of 4 randomly selected time series:
#' acf_n_plots(simdat$Y, random=TRUE, n=4, add=TRUE, 
#'     split_by=list(simdat$Subject, simdat$Trial))
#'
#' }
#'
#' #---------------------------------------------
#' # When using model residuals
#' #---------------------------------------------
#' 
#' \dontrun{
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' # simple linear model:
#' m1 <- lm(Y ~ Time, data=simdat)
#'
#' # This will generate an error:
#' # acf_n_plots(resid(m1), split_by=list(simdat$Subject, simdat$Trial))
#'
#' # This should work:
#' el.na <- missing_est(m1)
#' acf_n_plots(resid(m1), 
#'      split_by=list(simdat[-el.na,]$Subject, simdat[-el.na,]$Trial))
#'
#' # This should also work:
#' simdat$res <- NA
#' simdat[!is.na(simdat$Y),]$res <- resid(m1)
#' acf_n_plots(simdat$res, split_by=list(simdat$Subject, simdat$Trial))
#' }
#' 
#' # see the vignette for examples:
#' vignette('acf', package='itsadug')
#' @family functions for model criticism
acf_n_plots <- function(x, n = 5, split_by = NULL, cond = NULL, max_lag = NULL, fun = mean, plot = TRUE, random = F, 
    mfrow = NULL, add = FALSE, print.summary = getOption("itsadug_print"), ...) {
    
    # get acf data:
    suppressWarnings(acfdat <- acf_plot(x, split_by = split_by, cond = cond, max_lag = max_lag, fun = fun, 
        plot = FALSE, return_all = TRUE))
    if (!nrow(acfdat$acftable) >= n) {
        warning(sprintf("Number of time series in the data (%d) is smaller than n (%d). N is reduced to %d.\n", 
            nrow(acfdat$acftable), n, nrow(acfdat$acftable)))
        n <- nrow(acfdat$acftable)
    }
    # get lag 1A vector:
    lag1.all <- acfdat$acftable$"1"
    rn <- rownames(acfdat$acftable)
    lag1 <- lag1.all[!is.na(lag1.all)]
    nevents <- acfdat$n
    findNum <- rep(1, n)
    out <- list()
    
    if (random) {
        findNum <- sample(nrow(acfdat$acftable), size = n)
    } else {
        q <- quantile(lag1, probs = seq(0, 1, length = n))
        
        findClosestElement <- function(el, vals, num = TRUE) {
            y <- abs(vals - el)
            y <- min(y)
            if (num) {
                return(which(abs(vals - el) == y)[1])
            } else {
                return(vals[which(abs(vals - el) == y)[1]])
            }
        }
        
        findNum <- c()
        for (i in 1:n) {
            findNum <- c(findNum, findClosestElement(q[i], lag1))
            if (i > 1) {
                out[[i - 1]] <- list(quantile = c(q[i - 1], q[i]), elements = data.frame(event = rn[which(lag1.all >= 
                  q[i - 1] & lag1.all < q[i])], lag1 = lag1.all[which(lag1.all >= q[i - 1] & lag1.all < q[i])], 
                  stringsAsFactors = FALSE))
            }
        }
        out[["quantiles"]] <- q
        if (print.summary) {
            cat("Quantiles to be plotted:\n")
            print(q)
        }
    }
    
    if (plot) {
        cols <- ceiling(sqrt(n))
        rows <- ceiling(n/cols)
        
        if (!is.null(mfrow)) {
            cols = mfrow[2]
            rows = mfrow[1]
        }
        
        if (add == FALSE) {
            # dev.new(width = cols * 3, height = rows * 3)
            par(mfrow = c(rows, cols))
            oldmar <- par(mar = rep(3, 4))
        }
        parlist = list(...)
        if ("type" %in% names(parlist)) {
            parlist[["type"]] <- NULL
        }
        xlab <- "Lag"
        ylab <- "ACF"
        ylim <- range(acfdat$acftable[findNum, ], na.rm = T)
        main.new <- NULL
        if ("xlab" %in% names(parlist)) {
            xlab <- parlist[["xlab"]]
            parlist[["xlab"]] <- NULL
        }
        if ("ylab" %in% names(parlist)) {
            ylab <- parlist[["ylab"]]
            parlist[["ylab"]] <- NULL
        }
        if ("ylim" %in% names(parlist)) {
            ylim <- parlist[["ylim"]]
            parlist[["ylim"]] <- NULL
        }
        if ("main" %in% names(parlist)) {
            main.new <- parlist[["main"]]
            parlist[["main"]] <- NULL
        }
        other <- paste(sprintf("%s=%s", names(parlist), parlist), collapse = ",")
        
        for (j in 1:min(n, nrow(acfdat$acftable))) {
            main <- NULL
            if (is.null(main.new)) {
                main <- paste("ACF of", row.names(acfdat$acftable[findNum[j], ]))
            } else {
                if (length(main.new) >= j) {
                  main <- main.new[j]
                } else {
                  main <- main.new[1]
                }
            }
            eval(parse(text = sprintf("plot(as.numeric(colnames(acfdat$acftable)), acfdat$acftable[findNum[j], ], type = 'h', 
                main=main, xlab=xlab, ylab=ylab, ylim = ylim, %s)", 
                other)))
            ci <- -(1/nevents[findNum[j], "n"]) + 2/sqrt(nevents[findNum[j], "n"])
            abline(h = c(-1, 1) * ci, lty = 2, col = "blue")
            abline(h = 0)
        }
    }
    invisible(out)
}





#' Generate an ACF plot of an aggregated time series.
#' 
#' @export
#' @import mgcv
#' @import stats
#' @import graphics
#' @param x A vector with time series data, typically residuals of a 
#' regression model. 
#' (See examples for how to avoid errors due to missing values.)
#' @param split_by List of vectors (each with equal length as \code{x}) that 
#' group the values of \code{x} into trials or timeseries events. 
#' Generally other columns from the same data frame.
#' @param max_lag Maximum lag at which to calculate the acf. 
#' Default is the maximum for the longest time series.
#' @param cond Named list with a selection of the time series events
#' specified in \code{split_by}. Default is NULL, indicating that 
#' all time series are being processed, rather than a selection.
#' @param plot Logical: whether or not to plot the ACF. Default is TRUE.
#' @param fun The function used when aggregating over time series 
#' (depending on the value of \code{split_by}).
#' @param return_all Returning acfs for all time series.
#' @param ... Other arguments for plotting, see \code{\link[graphics]{par}}.
#' @return An aggregated ACF plot and / or optionally a list with the aggregated ACF values.
#' @author Jacolien van Rij
#' @seealso Use \code{\link[stats]{acf}} for the original ACF function, 
#' and \code{\link{acf_n_plots}} for inspection of individual time series.
#' @examples
#' data(simdat)
#'
#' # Default acf function:
#' acf(simdat$Y)
#' # Same plot with acf_plot:
#' acf_plot(simdat$Y)
#' # Average of ACFs per time series:
#' acf_plot(simdat$Y, split_by=list(simdat$Subject, simdat$Trial))
#' # Median of ACFs per time series:
#' acf_plot(simdat$Y, split_by=list(simdat$Subject, simdat$Trial), fun=median)
#'
#' # extract value of Lag1:
#' lag1 <- acf_plot(simdat$Y, 
#'    split_by=list(Subject=simdat$Subject, Trial=simdat$Trial), 
#'    plot=FALSE)['1']
#'
#' #---------------------------------------------
#' # When using model residuals
#' #---------------------------------------------
#' 
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' # simple linear model:
#' m1 <- lm(Y ~ Time, data=simdat)
#'
#' \dontrun{
#' # This will generate an error:
#' acf_plot(resid(m1), split_by=list(simdat$Subject, simdat$Trial))
#' }
#' # This should work:
#' el.na <- missing_est(m1)
#' acf_plot(resid(m1), 
#'      split_by=list(simdat[-el.na,]$Subject, simdat[-el.na,]$Trial))
#'
#' # This should also work:
#' simdat$res <- NA
#' simdat[!is.na(simdat$Y),]$res <- resid(m1)
#' acf_plot(simdat$res, split_by=list(simdat$Subject, simdat$Trial))
#'
#' # see the vignette for examples:
#' vignette('acf', package='itsadug')
#'
#' @family functions for model criticism
acf_plot <- function(x, split_by = NULL, max_lag = NULL, plot = TRUE, fun = mean, cond = NULL, return_all = FALSE, 
    ...) {
    
    # check x:
    if (!is.vector(x)) {
        if (dim(x)[1] == 1) {
            x <- as.vector(x[1, ])
        } else if (dim(x)[2] == 1) {
            x <- as.vector(x[, 1])
        } else {
            stop(sprintf("%s is not a vector (dim: %d x %d).\n", deparse(substitute(x)), dim(x)[1], dim(x)[2]))
        }
    }
    xname <- deparse(substitute(x))
    plotci <- FALSE
    
    # check length x
    if (length(x) > 1) {
        # find missing values:
        n <- which(is.na(x))
        
        # check split factors:
        if (!is.null(split_by)) {
            for (i in 1:length(split_by)) {
                if (length(split_by[[i]]) != length(x)) {
                  name_split_i <- ifelse(!is.null(names(split_by)[i]), names(split_by)[i], sprintf("split_by[[%d]]", 
                    i))
                  errormessage <- sprintf("Split factor %s is not of same length as %s: %s has %d elements, %s has %d elements.\n
                    See help(acf_plot) for examples how to avoid this error.", 
                    name_split_i, deparse(substitute(x)), deparse(substitute(x)), length(x), name_split_i, 
                    length(split_by[[i]]))
                  stop(errormessage)
                }
            }
        } else {
            # warning(sprintf('No argument for split provided. %s is treated as a single time series.\n',
            # deparse(substitute(x))))
            split_by <- as.factor(rep("Factor", length(x)))
            plotci <- TRUE
        }
        # check condition:
        if (!is.null(cond)) {
            if (!is.null(split_by)) {
                el <- 1:length(split_by[[1]])
                for (i in names(cond)) {
                  if (i %in% names(split_by)) {
                    el <- intersect(el, which(split_by[[i]] %in% cond[[i]]))
                  } else {
                    warning(sprintf("Predictor %s not specified in split_by. cond will be ignored.", i))
                  }
                }
                if (length(el) > 0) {
                  x <- x[el]
                  for (i in names(split_by)) {
                    if (is.factor(split_by[[i]])) {
                      split_by[[i]] <- droplevels(split_by[[i]][el])
                    } else {
                      split_by[[i]] <- split_by[[i]][el]
                    }
                  }
                } else {
                  warning("Specified conditions not found in values of split_by. cond will be ignored.")
                }
                
            } else {
                warning("Split_by is empty, therefore cond will be ignored. Specify time series in cond for selecting specific time series.")
            }
        }
        
        # split x, and calculate average acf:
        splitdat <- split(x, f = split_by, drop = T)
        acfn <- lapply(splitdat, FUN = function(x) {
            x.rmna <- x[!is.na(x)]
            return(length(x.rmna))
        })
        splitacf <- lapply(splitdat, FUN = function(x) {
            x.rmna <- x[!is.na(x)]
            return(acf(x.rmna, plot = F)$acf)
        })
        len <- max_lag
        if (is.null(len)) {
            len <- max(unlist(lapply(splitacf, FUN = length)), na.rm = T)
        }
        splitacf <- lapply(splitacf, FUN = function(x, max = len) {
            if (length(x) < max) {
                return(c(x, rep(NA, max - length(x))))
            } else if (length(x) > max) {
                return(x[1:max])
            } else {
                return(x)
            }
        })
        # create wide format dfr:
        allacf <- as.data.frame(do.call("rbind", splitacf), stringsAsFactors = FALSE)
        names(allacf) <- (1:ncol(allacf)) - 1
        avgacf <- apply(allacf, 2, FUN = fun)  #apply(allacf, 2, FUN=fun, na.rm=T)
        
        if (plot) {
            # set plot arguments
            plot_default <- list(main = sprintf("ACF of %s", xname), xlab = "Lag", ylab = ifelse(plotci == 
                TRUE, "ACF function (per time series)", "ACF"), ylim = c(min(min(avgacf, na.rm = TRUE), 0), 
                max(max(avgacf, na.rm = TRUE), 1)), col = "black", type = "h")
            
            plot_args <- list(...)
            
            # merge plot info
            plot_args_def <- c()
            for (pa in names(plot_default)[!names(plot_default) %in% names(plot_args)]) {
                value <- plot_default[[pa]]
                if (length(value) > 1) {
                  value <- sprintf("c(%s)", paste(value, collapse = ","))
                  plot_args_def <- c(plot_args_def, paste(pa, value, sep = "="))
                } else {
                  plot_args_def <- c(plot_args_def, paste(pa, ifelse(is.character(value), sprintf("'%s'", 
                    value), value), sep = "="))
                }
            }
            
            if (length(plot_args_def) > 0) {
                eval(parse(text = paste("plot(0:(len-1), avgacf, ", paste(plot_args_def, collapse = ","), 
                  ", ...)")))
            } else {
                plot(0:(len - 1), avgacf, ...)
            }
            if (plotci) {
                ci <- -(1/acfn[[1]]) + 2/sqrt(acfn[[1]])
                abline(h = c(-1, 1) * ci, lty = 2, col = "blue")
            }
            # }else{ tmpn <- length(allacf[!is.na(allacf)]) ci <- -(1/tmpn)+2/sqrt(tmpn) abline(h=c(-1,1)*ci, lty=2,
            # col='blue') }
            abline(h = 0)
        }
        
        # set output:
        acf_out <- avgacf
        if (return_all) {
            # create long format dfr:
            dfracf <- do.call("rbind", mapply(function(x, y, z) {
                data.frame(acf = x, lag = 0:(length(x) - 1), n = rep(y, length(x)), event = rep(z, length(x)), 
                  stringsAsFactors = FALSE)
            }, splitacf, acfn, names(splitacf), SIMPLIFY = FALSE, USE.NAMES = FALSE))
            dfracf$ci <- -(1/dfracf$n) + 2/sqrt(dfracf$n)
            # add event info:
            events <- as.data.frame(split_by, stringsAsFactors = FALSE)
            events$event <- apply(events, 1, function(x) {
                gsub(" ", "", paste(x, collapse = "."), fixed = TRUE)
            })
            events <- events[!duplicated(events), ]
            dfracf <- merge(dfracf, events, by = "event", all.x = TRUE, all.y = FALSE)
            acfn <- do.call("rbind", lapply(names(acfn), function(x) {
                data.frame(n = acfn[[x]], event = x, stringsAsFactors = FALSE)
            }))
            
            acf_out <- list(acf = avgacf, acftable = allacf, dataframe = dfracf, n = acfn, series = deparse(substitute(x)), 
                FUN = fun)
            
        }
        invisible(acf_out)
    } else {
        stop(sprintf("Not sufficient data to plot ACF: %s has %d elements.\n", deparse(substitute(x)), length(x)))
    }
}





#' Generate an ACF plot of model residuals. Works for lm, lmer, gam, bam, ....
#' 
#' @description Wrapper around \code{\link{acf_plot}} and 
#' \code{\link{acf_n_plots}} for regression models.
#' @export
#' @import mgcv
#' @import stats
#' @param model A regression model generated by \code{lm}, \code{glm},
#' \code{lmer}, \code{glmer}, \code{\link[mgcv]{gam}}, 
#' or \code{\link[mgcv]{bam}}.
#' (See examples for how to avoid errors due to missing values.)
#' @param split_pred Vector with names of model predictors that determine
#' the time series in the data, or should be used to split the ACF plot by.
#' Alternatively, \code{split_pred} can be a named list as being used by 
#' \code{\link{acf_plot}} and \code{\link{acf_n_plots}}.
#' Yet another option is to provide the text string 'AR.start', for a model 
#' that includes an AR1 model. The events are derived from the AR.start column 
#' if that is provided.
#' @param n The number of plots to generate. If \code{n}=1 (default) then 
#' \code{\link{acf_plot}} is being called. If \code{n}>1 then 
#' \code{\link{acf_n_plots}} is being called.
#' @param plot Logical: whether or not to produce plot. Default is TRUE.
#' @param check.rho Numeric value: Generally leave at NULL. This value does 
#' not change anything, but it is used to check whether the model's AR1 
#' coefficient matches the expected value of rho. 
#' @param main Text string, title of plot.
#' @param ... Other arguments as input for \code{\link{acf_plot}} 
#' or \code{\link{acf_n_plots}}.
#' @return An aggregated ACF plot and / or optionally a list with the aggregated ACF values.
#' @author Jacolien van Rij
#' @seealso Use \code{\link[stats]{acf}} for the original ACF function, 
#' and \code{\link{acf_plot}}, or \code{\link{acf_n_plots}}.
#' @examples
#' data(simdat)
#'
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' 
#' \dontrun{
#' # Run GAMM model:
#' m1 <- bam(Y ~ te(Time, Trial)+s(Subject, bs='re'), data=simdat)
#'
#' # Using a list to split the data:
#' acf_resid(m1, split_pred=list(simdat$Subject, simdat$Trial))
#' # ...or using model predictors:
#' acf_resid(m1, split_pred=c('Subject', 'Trial'))
#' 
#' # Calling acf_n_plots:
#' acf_resid(m1, split_pred=c('Subject', 'Trial'), n=4)
#' # add some arguments:
#' acf_resid(m1, split_pred=c('Subject', 'Trial'), n=4, max_lag=10)
#' 
#' # This does not work...
#' m2 <- lm(Y ~ Time, data=simdat)
#' acf_resid(m2, split_pred=c('Subject', 'Trial'))
#' # ... but this is ok:
#' acf_resid(m2, split_pred=list(simdat$Subject, simdat$Trial))
#' 
#' # Using AR.start column:
#' simdat <- start_event(simdat, event=c('Subject', 'Trial'))
#' r1 <- start_value_rho(m1)
#' m3 <- bam(Y ~ te(Time, Trial)+s(Subject, bs='re'), data=simdat, 
#'     rho=r1, AR.start=simdat$start.event)
#' acf_resid(m3, split_pred='AR.start')
#' # this is the same:
#' acf_resid(m3, split_pred=c('Subject', 'Trial'))
#' # Note: use model comparison to find better value for rho
#' }
#' # see the vignette for examples:
#' vignette('acf', package='itsadug')
#' @family functions for model criticism
acf_resid <- function(model, split_pred = NULL, n = 1, plot = TRUE, check.rho = NULL, main = NULL, ...) {
    split_by = NULL
    res <- NULL
    if (!is.null(check.rho)) {
        if ("AR1.rho" %in% names(model)) {
            mess <- "AR.start is not specified."
            if ("(AR.start)" %in% names(model$model)) {
                mess <- ""
            }
            if (model$AR1.rho != check.rho) {
                message(sprintf("The model's value of rho %f does not match the expected value %f. %s", model$AR1.rho, 
                  check.rho, mess))
            } else {
                message(sprintf("The model's value of rho matches the expected value %f. %s", check.rho, mess))
            }
        } else {
            if (!inherits(model, "bam")) {
                message("No value for rho specified in model. Use function bam() to specify rho.")
            } else if ("(AR.start)" %in% names(model$model)) {
                message("AR.start is specified, but no value set for rho.")
            } else {
                message("Argument rho and AR.start not set in this model.")
            }
        }
    }
    if (!is.null(split_pred)) {
        
        split_by = list()
        if (!is.list(split_pred)) {
            # extract time series data from model:
            dat <- NULL
            if ("lm" %in% class(model)) {
                dat <- model$model
            } else if ("lmerMod" %in% class(model)) {
                dat <- model@frame
            }
            if (length(split_pred) == 1 & split_pred[1] %in% c("AR.start", "start.event")) {
                if (split_pred %in% colnames(dat)) {
                  if (is.logical(dat[, split_pred])) {
                    split_by[[split_pred]] = derive_timeseries(dat[, split_pred])
                  } else {
                    split_by[[split_pred]] = dat[, split_pred]
                  }
                } else {
                  split_by[[split_pred]] = derive_timeseries(model)
                }
            } else if (!all(split_pred %in% colnames(dat))) {
                notindata <- paste(split_pred[!split_pred %in% colnames(dat)], collapse = ", ")
                stop(sprintf("split-pred value(s) %s is / are not included as predictor in the model.", notindata))
            } else {
                for (i in split_pred) {
                  split_by[[i]] <- as.vector(dat[, i])
                }
            }
        } else {
            split_by <- split_pred
            # check for missing data
            me <- missing_est(model)
            if (!is.null(me)) {
                split_by <- lapply(split_pred, function(x) {
                  return(x[-me])
                })
            }
        }
    }
    if (n > 1) {
        if (!all(is.na(resid_gam(model, incl_na = TRUE)))) {
            if (is.null(split_pred)) {
                out <- acf_n_plots(resid_gam(model), split_by = split_by, n = n, plot = plot, ...)
                invisible(out)
                # }else if( !is.list(split_pred)){
            } else if (("(AR.start)" %in% colnames(model$model)) || (("AR1.rho" %in% names(model)) && model$AR1.rho > 
                0)) {
                out <- acf_n_plots(resid_gam(model, incl_na = TRUE), split_by = split_by, n = n, plot = plot, 
                  ...)
                invisible(out)
            } else {
                out <- acf_n_plots(resid_gam(model), split_by = split_by, n = n, plot = plot, ...)
                invisible(out)
            }
        }
    } else {
        if (!all(is.na(resid_gam(model, incl_na = TRUE)))) {
            if (is.null(main)) {
                main = ifelse(is.null(split_pred), sprintf("ACF resid_gam(%s)", deparse(substitute(model))), 
                  "ACF Average")
            }
            if (is.null(split_pred)) {
                out <- acf_plot(resid_gam(model), split_by = split_by, plot = plot, main = main, ...)
                invisible(out)
                # }else if( !is.list(split_pred)){
            } else if (("(AR.start)" %in% colnames(model$model)) || (("AR1.rho" %in% names(model)) && model$AR1.rho > 
                0)) {
                out <- acf_plot(resid_gam(model, incl_na = TRUE), split_by = split_by, plot = plot, main = main, 
                  ...)
                invisible(out)
            } else {
                out <- acf_plot(resid_gam(model), split_by = split_by, plot = plot, main = main, ...)
                invisible(out)
            }
        }
    }
}





#' Derive the time series used in the AR1 model.
#' 
#' @export
#' @import mgcv
#' @import stats
#' @import plotfunctions
#' @param model GAMM model that includes an AR1 model.
#' @param AR.start Vector with AR.start information, 
#' necessary for the AR1 model. Optional, defaults to NULL.
#' @return A vector with time series indication based on the AR1 model.
#' @author Jacolien van Rij
#' @examples
#' data(simdat)
#' 
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' simdat <- start_event(simdat, event=c('Subject', 'Trial'))
#' 
#' \dontrun{
#' # Run GAMM model:
#' m1 <- bam(Y ~ te(Time, Trial)+s(Subject, bs='re'), data=simdat, 
#'     rho=.5, AR.start=simdat$start.event)
#' simdat$Event <- NA
#' simdat[!is.na(simdat$Y),]$Event <- derive_timeseries(m1)
#' acf_resid(m1, split_pred=list(Event=simdat$Event))
#' 
#' # And this works too:
#' simdat$Event <- derive_timeseries(simdat$start.event)
#' acf_resid(m1, split_pred=list(Event=simdat$Event))
#' 
#' # Note that acf_resid automatically makes use of derive_timeseries:
#' acf_resid(m1, split_pred='AR.start')
#' }
#' @family functions for model criticism
derive_timeseries <- function(model, AR.start = NULL) {
    tmpdat <- c()
    na.el <- c()
    if (inherits(model, "lm")) {
        tmpdat <- model$model
        na.el <- missing_est(model)
    } else if (inherits(model, "lmerMod")) {
        tmpdat <- model@frame
        na.el <- missing_est(model)
    } else if (inherits(model, "logical")) {
        tmpdat <- data.frame(val = model, stringsAsFactors = FALSE)
        names(tmpdat) <- "(AR.start)"
        na.el <- which(is.na(model))
    }
    if (!"(AR.start)" %in% names(tmpdat)) {
        if (is.null(AR.start)) {
            stop("Model does not contain an AR1 model. Please specify AR.start information.")
        } else {
            if (length(AR.start) == nrow(tmpdat)) {
                tmpdat[, "(AR.start)"] <- AR.start
            } else if (length(AR.start) == (nrow(tmpdat) + length(na.el))) {
                tmpdat[, "(AR.start)"] <- AR.start[-na.el]
            } else {
                stop(sprintf("AR.start has more values (%f) than observations used in the model (%f).", length(AR.start), 
                  nrow(tmpdat)))
            }
        }
    } else {
        if (!is.null(AR.start)) {
            warning(sprintf("Model %s includes AR1 model, AR.start will be ignored.", deparse(substitute(model))))
        }
    }
    starts <- which(tmpdat[, "(AR.start)"] == TRUE)
    ends <- move_n_point(starts - 1, n = -1)
    ends[length(ends)] <- nrow(tmpdat)
    return(as.factor(rep(1:length(starts), ends - starts + 1)))
}





#' Extract model residuals and remove the autocorrelation accounted for. 
#' 
#' @export
#' @import mgcv
#' @import stats
#' @aliases resid.gam
#' @param model A GAMM model build with \code{\link[mgcv]{gam}} or 
#' \code{\link[mgcv]{bam}}.
#' @param AR_start Optional: vector with logicals, indicating the start of 
#' events. 
#' Default is NULL, because generally the function can retrieve all necessary 
#' information from the model.
#' @param incl_na Whether or not to include missing values (NA)when returning 
#' the residuals. Defaults to FALSE.
#' @param return_all Default is FALSE. Returns a list with normal residuals, 
#' corrected residuals, and the value of rho.
#' @return Corrected residuals.
#' @author Jacolien van Rij
#' @examples
#' data(simdat)
#' 
#' \dontrun{
#' # Add start event column:
#' simdat <- start_event(simdat, event=c('Subject', 'Trial'))
#' head(simdat)
#' # bam model with AR1 model (toy example, not serious model):
#' m1 <- bam(Y ~ Group + te(Time, Trial, by=Group), 
#'    data=simdat, rho=.5, AR.start=simdat$start.event)
#' # Standard residuals:
#' res1 <- resid(m1)
#' # Corrected residuals:
#' res2 <- resid_gam(m1)
#' 
#' # Result in different ACF's:
#' par(mfrow=c(1,2))
#' acf(res1)
#' acf(res2)
#'
#' # Without AR.start included in the model:
#' m2 <- bam(Y ~ Group + te(Time, Trial, by=Group), 
#'    data=simdat)
#' acf(resid_gam(m2), plot=F)
#' # Same as resid(m2)!
#' acf(resid(m2), plot=F)
#'
#' ### MISSING VALUES ###
#' # Note that corrected residuals cannot be calculated for the last 
#' # point of each time series. These missing values are by default
#' # excluded.
#'
#' # Therefore, this will result in an error...
#' simdat$res <- resid_gam(m1)
#' # ... and this will give an error too:
#' simdat$res <- NA
#' simdat[!is.na(simdat$Y),] <- resid_gam(m1)
#' # ... but this works:
#' simdat$res <- resid_gam(m1, incl_na=TRUE)
#' 
#' # The parameter incl_na will NOT add missing values
#' # for missing values in the *data*. 
#' # Example:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' # Without AR.start included in the model:
#' m2 <- bam(Y ~ Group + te(Time, Trial, by=Group), 
#'    data=simdat)
#' # This works:
#' acf(resid_gam(m2))
#' # ...but this results in error, although no AR1 model specified:
#' simdat$res <- resid_gam(m2)
#' # ... for this type of missing data, this does not solve the problem:
#' simdat$res <- resid_gam(m2, incl_na=TRUE)
#' # instead try this:
#' simdat$res <- NA
#' simdat[-missing_est(m2),]$res <- resid_gam(m2)
#' 
#' # With AR.start included in the model:
#' m1 <- bam(Y ~ Group + te(Time, Trial, by=Group), 
#'    data=simdat, rho=.5, AR.start=simdat$start.event)
#' # This works (incl_na=TRUE):
#' simdat$res <- NA
#' simdat[-missing_est(m2),]$res <- resid_gam(m2, incl_na=TRUE)
#'
#' }
#' @seealso \code{\link[stats]{resid}}, \code{\link{missing_est}}
#' @family functions for model criticism
resid_gam <- function(model, AR_start = NULL, incl_na = FALSE, return_all = FALSE) {
    
    # help function
    next_point <- function(x) {
        x <- as.vector(x)
        
        if (length(x) > 1) 
            return(c(x[2:length(x)], NA)) else if (length(x) == 1) 
            return(NA) else return(NULL)
    }
    
    # extract time series data from model:
    tmpdat <- NULL
    n <- 1
    rho <- 0
    if ("lm" %in% class(model)) {
        tmpdat <- model$model
        # check AR_start:
        if (is.null(tmpdat$"(AR.start)")) {
            tmpdat$"(AR.start)" <- rep(FALSE, nrow(tmpdat))
            tmpdat[1, ]$"(AR.start)" <- TRUE
            
            if (!is.null(model$AR1.rho)) {
                if (model$AR1.rho > 0) {
                  if (is.null(AR_start)) {
                    warning("No values for AR_start found in model specification. Please provide values for AR_start as argument to this function if you have run the model with an older version of mgcv (< 1.7.28).")
                  } else {
                    warning(sprintf("Values for argument AR_start may not be specified, although an AR1 model rho was included in model %s (rho = %f).", 
                      deparse(substitute(model)), model$AR1.rho))
                    tmpdat$"(AR.start)" <- AR_start
                  }
                }
            }
        }
        
        # additional check of AR.start:
        if ((!TRUE %in% unique(tmpdat$"(AR.start)")) | (!FALSE %in% unique(tmpdat$"(AR.start)"))) {
            stop("No event starts found in AR_start argument.\nImportant: check the values of the AR_start argument, add at least one event start (value TRUE), and run the model again.")
        }
        
        n <- which(tmpdat$"(AR.start)")
        if (is.null(model$AR1.rho)) {
            # warning('No rho specified in model. Assumed rho to equal 0.')
            model[["AR1.rho"]] <- 0
            rho <- 0
        } else {
            rho <- model$AR1.rho
        }
    } else if ("lmerMod" %in% class(model)) {
        tmpdat <- model@frame
        rho <- 0
    } else {
        stop(sprintf("Function does not work for models of class %s.", class(model)[1]))
    }
    
    
    if (nrow(tmpdat) == length(resid(model))) {
        tmpdat$RES <- resid(model)
    } else {
        tmpdat$RES <- NA
        if (nrow(tmpdat) == length(resid(model))) {
            tmpdat$RES <- resid(model)
        } else {
            tmpdat[!(1:nrow(tmpdat)) %in% model$na.action, ]$RES <- resid(model)
        }
    }
    
    tmpdat$RES_next <- next_point(tmpdat$RES)
    if (length(n[(n - 1) > 0]) > 0) {
        tmpdat[n[(n - 1) > 0], ]$RES_next <- rep(NA, length(n[(n - 1) > 0]))
    }
    
    res <- tmpdat$RES_next - rho * tmpdat$RES
    if (return_all) {
        return(list(res = tmpdat$RES, norm_res = res, AR1_rho = rho))
    } else {
        if (rho == 0) {
            res <- tmpdat$RES
        }
        if (incl_na) {
            return(res)
        } else {
            return(res[!is.na(res)])
        }
    }
}





#' Determine the starting point for each time series.
#' 
#' @export
#' @import mgcv
#' @import stats
#' @param data A data frame.
#' @param column Test string, name of the column that describes the order 
#' withing the time series. Default is 'Time'.
#' @param event A text string or vector indicating the columns that define the 
#' unique time series. Default is 'Event'.
#' @param label The name of the new column with the start point of each time 
#' series. Default is 'start.event'.
#' @param label.event In case \code{event} is not a single column, providing a 
#' text string will add a column with this name that defines unique time 
#' series. Default is NULL (no new column for time series is created).
#' @param order Logical: whether or not to order each time series. 
#' Default is TRUE, maybe set to FALSE with large data frames that are already ordered.
#' @param newcode Logical: whether or not to use the new (and hopefully faster) code.
#' @section Note:
#' When working with large data frames, it may be worth installing the package 
#' \code{data.table}. Although not required for the package, the function 
#' \code{start_event} will check if \code{data.table} is available and will 
#' use it's much faster function \code{rbindlist}. This speeds up the function 
#' \code{start_event}. Run the command 
#' \code{install.packages('data.table', repos='http://cran.us.r-project.org')} 
#' in the command window for installing the package \code{data.table}.
#' @return Data frame.
#' @author Jacolien van Rij
#' @examples 
#' data(simdat)
#' head(simdat)
#' simdat <- simdat[sample(1:nrow(simdat)),]
#' simdat$Condition <- relevel(factor(simdat$Condition), ref="0")
#' contrasts(simdat$Condition) <- "contr.poly"
#' contrasts(simdat$Condition)
#' test <- start_event(simdat, event=c('Subject', 'Trial'), label.event='Event') 
#' contrasts(test$Condition)
#' head(test)
#' test <- start_event(simdat, event="Subject")
#' test <- start_event(simdat, event=c('Subject', 'Trial'))
#' @family functions for model criticism
start_event <- function(data, column = "Time", event = "Event", label = "start.event", label.event = NULL, 
    order = TRUE, newcode = TRUE) {
    if (is.null(label)) {
        stop("No output column specified in argument 'label'.")
    }
    if (!column %in% names(data)) {
        stop(sprintf("No column '%s' in data frame %s.", column, deparse(substitute(data))))
    }
    if (!all(event %in% names(data))) {
        el <- paste(which(event[!event %in% names(data)]), collapse = ",")
        stop(sprintf("Column name(s) '%s' not found in data frame %s.", el, deparse(substitute(data))))
    }
    if (length(column) > 1) {
        warning(sprintf("Argument column has %d elements. Only first element is used.", length(column)))
        column <- column[1]
    }
    if (label %in% names(data)) {
        warning(sprintf("Column %s already exists, will be overwritten.", label))
    }
    if (!is.null(label.event)) {
        if (label.event %in% names(data)) {
            warning(sprintf("Column %s already exists, will be overwritten.", label.event))
        }
    }
    

    if(newcode){
        if (!is.null(label.event) & length(event) > 1) {
            data[, label.event] <- interaction(data[, event], drop=TRUE)  
            if(order){
                data <- data[order(data[, label.event], data[, column]),]
            }
            check <- tapply(data[,column], list(data[, label.event]), 
                function(x){return(min(x, na.rm=TRUE))})   
            data[, label] <- data[, column] == check[data[,label.event]]     
        } else if (length(event) > 1) {
            dummy.event.label = paste(c("tmpcolumn_", sample(c(letters, as.character(0:9), "_", "."), 
                size=100, replace=TRUE)), collapse="")
            data[, dummy.event.label] <- interaction(data[, event], drop=TRUE)
            if(order){
                data <- data[order(data[, dummy.event.label], data[, column]),]
            }  
            check <- tapply(data[,column], list(data[, dummy.event.label]), 
                function(x){return(min(x, na.rm=TRUE))})   
            data[, label] <- data[, column] == check[data[,dummy.event.label]] 
            data[, dummy.event.label] <- NULL  
        } else {
            if(order){
                data <- data[order(data[, event], data[, column]),]
            }
            check <- tapply(data[,column], list(data[, event]), 
                function(x){return(min(x, na.rm=TRUE))})   
            data[, label] <- data[, column] == check[data[,event]] 
        }
        return(data)
    } else {
    ## OLD CODE:
        # store reference levels:
        ref <- refLevels(data)
        tmp <- NULL
        if (!is.null(label.event) & length(event) > 1) {
            data[, label.event] <- interaction(data[, event])
            tmp <- split(data, f = list(data[, label.event]), drop = TRUE)
        } else if (length(event) > 1) {
            tmp <- split(data, f = as.list(data[, event]), drop = TRUE)
        } else {
            tmp <- split(data, f = list(data[, event]), drop = TRUE)
        }
        if (order) {
            tmp <- lapply(tmp, function(x) {
                min.x <- min(x[, column])
                x[, label] <- x[, column] == min.x
                x <- x[order(x[, column]), ]
                return(x)
            })
        } else {
            tmp <- lapply(tmp, function(x) {
                min.x <- min(x[, column])
                x[, label] <- x[, column] == min.x
                return(x)
            })
        }
        if (requireNamespace("data.table", quietly = TRUE)) {
            tmp <- as.data.frame(data.table::rbindlist(tmp), stringsAsFactors = FALSE)
        } else {
            if(getOption("itsadug_print")){
                message("Package data.table is not installed. This package may speed up the function start_event.")
            }
            tmp <- do.call("rbind", tmp)
        }
        row.names(tmp) <- NULL
        # put same reference levels back:
        for(i in names(ref)){
            tmp[,i] <- relevel(as.factor(tmp[,i]), ref=ref[[i]][['ref']])
            contrasts(tmp[,i]) <- ref[[i]][['contr']]
        }
        return(tmp)
    }
}





#' Extract the Lag 1 value from the ACF of the residuals of a gam, bam, lm, 
#' lmer model, ...
#' 
#' @description Wrapper around \code{\link{acf_plot}} for regression models.
#' @export
#' @import mgcv
#' @import stats
#' @param model A regression model generated by \code{lm}, \code{glm},
#' \code{lmer}, \code{glmer}, \code{\link[mgcv]{gam}}, 
#' or \code{\link[mgcv]{bam}}.
#' (See examples for how to avoid errors due to missing values.)
#' @param plot Logical: whether or not to produce plot. Default is TRUE.
#' @param lag Numeric value, indicating the lag. Default is 2. 
#' @param main Text string, title of ACF plot.
#' @param ... Other arguments for plotting the acf, see 
#' \code{\link[stats]{acf}}.
#' @return The autocorrelation value of data points with the data points 
#' at lag \code{lag}.
#' @author Jacolien van Rij
#' @seealso Use \code{\link[stats]{acf}} for the original ACF function, 
#' and \code{\link{acf_plot}}, or \code{\link{acf_resid}}.
#' @examples
#' data(simdat)
#'
#' # add missing values to simdat:
#' simdat[sample(nrow(simdat), 15),]$Y <- NA
#' 
#' \dontrun{
#' # Run GAMM model:
#' m1 <- bam(Y ~ te(Time, Trial)+s(Subject, bs='re'), data=simdat)
#'
#' # No plotting:
#' start_value_rho(m1)
#' # With plot:
#' rhom1 <- start_value_rho(m1, plot=TRUE)
#' 
#' }
#' # see the vignette for examples:
#' vignette('acf', package='itsadug')
#' @family functions for model criticism
#' 
start_value_rho <- function(model, plot = FALSE, lag = 2, main = NULL, ...) {
    rval <- acf(resid(model), plot = plot, main = ifelse(is.null(main), sprintf("Series resid(%s)", deparse(substitute(model))), 
        main), ...)$acf[lag]
    if (plot) {
        par <- list(...)
        f <- par()$lwd
        if ("lwd" %in% names(par)) {
            f <- par[["lwd"]]
        }
        points(lag - 1, rval, col = "red", type = "h", lwd = 1.5 * f)
        abline(h = rval, col = "red", lty = 3)
    }
    return(rval)
}

Try the itsadug package in your browser

Any scripts or data that you put into this service are public.

itsadug documentation built on June 17, 2022, 5:05 p.m.