R/util.R

Defines functions timeBins print_summary summary_data refLevels missing_est find_difference res_df observations modeledf diff_terms corfit convertNonAlphanumeric

Documented in convertNonAlphanumeric corfit diff_terms find_difference missing_est modeledf observations print_summary refLevels res_df summary_data timeBins

#' Prepare string for regular expressions 
#' (backslash for all non-letter and non-digit characters)
#' 
#' @export
#' @param text A text string (smooth term label) that needs to be converted 
#' to a regular expression. 
#' @return A regular expression string.
#' @author Jacolien van Rij
#' @examples
#' data(simdat)
#' # Model for illustrating coefficients:
#' m0 <- bam(Y ~ s(Time) + s(Subject, bs='re') 
#' + s(Time, Subject, bs='re'), data=simdat)
#' 
#' # get all coefficients:
#' coef(m0)
#' # to get only the Subject intercepts:
#' coef(m0)[grepl(convertNonAlphanumeric('s(Subject)'), names(coef(m0)))]
#' # to get only the Subject slopes:
#' coef(m0)[grepl(convertNonAlphanumeric('s(Time,Subject)'), names(coef(m0)))]
#'
#' @family Utility functions
convertNonAlphanumeric <- function(text) {
    return(gsub("([^a-zA-Z0-9])", "\\\\\\1", as.character(text)))
}





#' Calculate the correlation between the fitted model and data.
#' 
#' @export
#' @import mgcv
#' @param model A fitted regression model (using gam, or bam).
#' @return Numeric value: correlation between fitted model and data. 
#' @examples 
#' data(simdat)
#'
#' # Fit simple GAM model:
#' gam1 <- bam(Y ~ s(Time), data=simdat, discrete=TRUE)
#' corfit(gam1)
#' 
#' @family Utility functions
corfit = function(model) {
    if ("gamm" %in% class(model)) {
        stop(sprintf("Apply the function on the gam part of the model: modeledf(%s$gam)", deparse(substitute(model))))
    } else if ((!"gam" %in% class(model))) {
        stop("Model is not a gam object. Currently this function only works for models build with bam(), gam(), or gamm().")
    }
    return(cor(model$model[, 1], fitted(model)))
}





#' Compare the formulas of two models and return the difference(s). 
#' 
#' @export
#' @import mgcv
#' @param model1 A fitted regression model (using lm, glm, gam, or bam).
#' @param model2 A fitted regression model (using lm, glm, gam, or bam).
#' @return A list with model terms that are not shared by both models.
#' @author Jacolien van Rij
#' @examples 
#' data(simdat)
#'
#' # Fit simple GAM model:
#' gam1 <- bam(Y ~ s(Time), data=simdat)
#' gam2 <- bam(Y ~ Group+s(Time), data=simdat)
#' diff_terms(gam1, gam2)
#'
#' @family Utility functions
diff_terms <- function(model1, model2) {
    fa <- c()
    fb <- c()
    get_formula <- function(x) {
        if ("gam" %in% class(x)) {
            return(attr(terms(x$formula), "term.labels"))
        } else {
            stop(sprintf("This function does not work for model of class %s.", class(x)))
        }
    }
    fa <- get_formula(model1)
    fb <- get_formula(model2)
    d1 <- fa[!fa %in% fb]
    d2 <- fb[!fb %in% fa]
    out <- list()
    out[[deparse(substitute(model1))]] <- d1
    out[[deparse(substitute(model2))]] <- d2
    return(out)
}





#' Retrieve the degrees of freedom specified in the model.
#' 
#' @export
#' @import mgcv
#' @param model A fitted regression model (using gam, or bam).
#' @return Numeric value: degrees of freedom specified in the model.
#' @examples 
#' data(simdat)
#' 
#' \dontrun{
#' # models take somewhat longer time to run:
#' 
#' # Fit simple GAM model:
#' gam1 <- bam(Y ~ s(Time), data=simdat, discrete=TRUE)
#' modeledf(gam1)
#' gam2 <- bam(Y ~ s(Time)+s(Time, Subject, bs='fs', m=1), 
#'     data=simdat, discrete=TRUE)
#' modeledf(gam2)
#' gam3 <- bam(Y ~ Subject+s(Time, by=Subject), 
#'     data=simdat, discrete=TRUE)
#' modeledf(gam3)
#' gam4 <- bam(Y ~ Group+s(Time)+s(Time, Subject, bs='fs', m=1), 
#'     data=simdat, discrete=TRUE)
#' modeledf(gam4) 
#' gam5 <- bam(Y ~ Group+s(Time, by=Group)+s(Time, Subject, bs='fs', m=1), 
#'     data=simdat, discrete=TRUE)
#' modeledf(gam5) 
#' 
#' # Fit a gamm:
#' gam6 <- gamm(Y ~ Group+s(Time), random=list(Subject=~1) data=simdat, discrete=TRUE)
#' # this produces an error...
#' modeledf(gam6)
#' # ... but this works:
#' modeledf(gam6$gam)
#' }
#' 
#' @family Utility functions
modeledf = function(model) {
    if ("gamm" %in% class(model)) {
        stop(sprintf("Apply the function on the gam part of the model: modeledf(%s$gam)", deparse(substitute(model))))
    } else if ((!"gam" %in% class(model))) {
        stop("Model is not a gam object. Currently this function only works for models build with bam(), gam(), or gamm().")
    }
    null.space.dim <- 0
    if (length(model$smooth) > 0) {
        null.space.dim <- sum(sapply(model$smooth, FUN = function(x) {
            x$null.space.dim
        }, USE.NAMES = FALSE))
    }
    return(length(model$sp) + model$nsdf + null.space.dim)
}
#' Number of observations in the model.
#' 
#' @export
#' @import mgcv
#' @param model A fitted regression model (using gam, bam, (g)lm, (g)lmer).
#' @return Numeric value: number of observations that are considered by the 
#' model. 
#' @examples 
#' data(simdat)
#' # simulate some missing data:
#' simdat[sample(1:nrow(simdat), size=15),]$Y <- NA
#' simdat[sample(1:nrow(simdat), size=7),]$Group <- NA
#' 
#' # Fit simple GAM models:
#' gam1 <- bam(Y ~ s(Time), data=simdat, discrete=TRUE)
#' gam2 <- bam(Y ~ Group + s(Time, by=Group), data=simdat, discrete=TRUE)
#' 
#' # number of data points in data frame:
#' nrow(simdat)
#' 
#' # observations model gam1:
#' observations(gam1)
#' # observations model gam2:
#' observations(gam2)
#' 
#' @family Utility functions
observations = function(model) {
    if ("gamm" %in% class(model)) {
        stop(sprintf("Apply the function on the gam part of the model: observations(%s$gam)", deparse(substitute(model))))
    } else if ("lme" %in% class(model)) {
        return(nrow(model@frame))
    } else if ("lm" %in% class(model)) {
        return(nrow(model$model))
    } else if (("gam" %in% class(model))) {
        return(nrow(model$model))
    } else {
        stop("Currently this function only works for models build with bam(), gam(), glm(), lm(), lmer(), and glmer().")
    }
}
#' Retrieve the residual degrees of freedom from the model.
#' 
#' @export
#' @import mgcv
#' @param model A fitted regression model (using gam, or bam).
#' @return Numeric value: residual degrees of freedom from the model.
#' @examples 
#' data(simdat)
#'
#' # Fit simple GAM model:
#' gam1 <- bam(Y ~ s(Time), data=simdat, discrete=TRUE)
#' res_df(gam1)
#' # ... which is the same as:
#' 
#' modeledf(gam1)
#' 
#' @family Utility functions
res_df = function(model) {
    if ("gamm" %in% class(model)) {
        stop(sprintf("Apply the function on the gam part of the model: res_df(%s$gam)", deparse(substitute(model))))
    } else if ((!"gam" %in% class(model))) {
        stop("Model is not a gam object. Currently this function only works for models build with bam(), gam(), or gamm().")
    }
    return(nrow(model$model) - modeledf(model))
}





#' Return the regions in which the smooth is significantly different from zero.
#' 
#' @export
#' @import grDevices
#' @import graphics
#' @param mean A vector with smooth predictions.
#' @param se A vector with the standard error on the smooth predictions.
#' @param xVals Optional vector with x values for the smooth. 
#' When \code{xVals} is provided, the regions are returned in terms of x-
#' values, otherwise as indices.
#' @param f A number to multiply the \code{se} with, to convert the \code{se} 
#' into confidence intervals. Use 1.96 for 95\% CI and 2.58 for 99\%CI.
#' @param as.vector Logical: whether or not to return the data points as 
#' vector, or not. Default is FALSE, and a list with start and end points will
#'  be returned.
#' @return The function returns a list with start points of each region 
#' (\code{start}) and end points of each region (\code{end}). The logical 
#' \code{xVals} indicates whether the returned values are on the x-scale 
#' (TRUE) or indices (FALSE).
#' @examples
#' data(simdat)
#' 
#' # Use aggregate to calculate mean and standard deviation per timestamp:
#' avg <- aggregate(simdat$Y, by=list(Time=simdat$Time),
#'     function(x){c(mean=mean(x), sd=sd(x))})
#' head(avg)
#' # Note that column x has two values in each row:
#' head(avg$x)
#' head(avg$x[,1])
#' 
#' # Plot line and standard deviation:
#' emptyPlot(range(avg$Time), c(-20,20), h0=0)
#' plot_error(avg$Time, avg$x[,'mean'], avg$x[,'sd'], 
#'    shade=TRUE, lty=3, lwd=3)
#'
#' # Show difference with 0:
#' x <- find_difference(avg$x[,'mean'], avg$x[,'sd'], xVals=avg$Time)
#' # Add arrows:
#' abline(v=c(x$start, x$end), lty=3, col='red')
#' arrows(x0=x$start, x1=x$end, y0=-5, y1=-5, code=3, length=.1, col='red')
#' @author Jacolien van Rij
#'
#' @family Utility functions
find_difference <- function(mean, se, xVals = NULL, f = 1, as.vector = FALSE) {
    if (length(mean) != length(se)) {
        stop("The vectors mean and se are not equal in length.")
    } else {
        ub <- mean + f * se
        lb <- mean - f * se
        
        n <- which(!(ub >= 0 & lb <= 0))
        if (as.vector) {
            if (length(n) == 0) {
                return(rep(FALSE, length(mean)))
            } else {
                out <- rep(FALSE, length(mean))
                out[n] <- TRUE
                return(out)
            }
        } else {
            if (length(n) == 0) {
                return(NULL)
            } else {
                n_prev <- c(NA, n[1:(length(n) - 1)])
                n_next <- c(n[2:length(n)], NA)
                if (!is.null(xVals) & (length(xVals) == length(mean))) {
                  return(list(start = xVals[n[which(is.na(n - n_prev) | (n - n_prev) > 1)]], end = xVals[n[which(is.na(n_next - 
                    n) | (n_next - n) > 1)]], xVals = TRUE))
                } else {
                  return(list(start = n[which(is.na(n - n_prev) | (n - n_prev) > 1)], end = n[which(is.na(n_next - 
                    n) | (n_next - n) > 1)], xVals = FALSE))
                }
                
            }
        }
    }
}





#' Return indices of data that were not fitted by the model.
#' 
#' @export
#' @param model A fitted regression model (using lm, glm, gam, or bam).
#' @return The indices of the data that were not fitted by the model.
#' @author Jacolien van Rij
#' @examples 
#' data(simdat)
#'
#' # Add missing values:
#' set.seed(123)
#' simdat[sample(nrow(simdat), size=20),]$Y <- NA
#' # Fit simple linear model:
#' lm1 <- lm(Y ~ Time, data=simdat)
#' na.el <- missing_est(lm1)
#' length(na.el)
#'
#' @family Utility functions
missing_est <- function(model) {
    if ("lm" %in% class(model)) {
        el <- unique(model$na.action)
        if (length(el) > 0) {
            return(sort(el))
        } else {
            return(el)
        }
    } else {
        stop("This method is currently only implemented for lm, glm, gam, and bam models.")
    }
}





#' Return a list with reference levels for each factor.
#' 
#' @export
#' @description Function for retrieving all reference levels.
#' @param data Data
#' @return Named list with reference levels for each factor.
#' @author Jacolien van Rij
#' @family Utility functions
refLevels <- function(data) {
    var.factor <- names(data)[sapply(data, function(x){ inherits(x, 'factor')})]
    if(length(var.factor) > 0){
    	return( lapply(data[ ,var.factor], function(x){
    		return(list(ref=levels(x)[1], contr=contrasts(x)))
    	}) )
    }else{
    	return(list())
    }
}





#' Print a descriptive summary of a data frame.
#' 
#' @export
#' @description The function prints a summary of the data. 
#' Similar to the function \code{\link[utils]{str}}, but easier readable.
#' @param data A data frame.
#' @param print Logical: whether or not to print the summary.
#' @param n Number: maximum number of values being mentioned in the summary.
#' If NULL all values are being mentioned. Defaults to 10.
#' @return Optionally returns a named list with info.
#' @author Jacolien van Rij
#' @examples
#' data(simdat)
#' summary_data(simdat)
#' @family Utility functions
summary_data <- function(data, print = TRUE, n = 10) {
    
    labelColumns <- function(x, data) {
        out <- NULL
        cn <- ifelse(is.numeric(x), colnames(data)[x], x)
        cl <- class(data[, cn])
        if (inherits(data[, cn], "factor")) {
            vals <- sort(unique(as.character(data[, x])))
            n.cur <- length(vals) + 1
            if (!is.null(n)) {
                n.cur <- n
            }
            if (length(vals) > n.cur) {
                out <- sprintf("factor with %d values; set to the value(s): %s, ...", length(vals), paste(vals[1:n.cur], 
                  collapse = ", "))
            } else {
                out <- sprintf("factor; set to the value(s): %s.", paste(vals, collapse = ", "))
            }
        } else if (inherits(data[, cn], "numeric")) {
            if (length(unique(data[, x])) > 2) {
                out <- sprintf("numeric predictor; with %d values ranging from %f to %f.", length(unique(data[, 
                  x])), min(data[, x], na.rm = TRUE), max(data[, x], na.rm = TRUE))
            } else {
                out <- sprintf("numeric predictor; set to the value(s): %s.", paste(unique(data[, x]), collapse = ", "))
            }
        } else if (inherits(data[, cn], "matrix")) {
            if (length(unique(data[, x])) > 2) {
                out <- sprintf("a matrix predictor; with %d values ranging from %f to %f.", length(unique(data[, 
                  x])), min(data[, x], na.rm = TRUE), max(data[, x], na.rm = TRUE))
            } else {
                out <- sprintf("matrix predictor; set to the value(s): %s.", paste(unique(data[, x]), collapse = ", "))
            }
        } else {
            vals <- sort(unique(data[, x]))
            n.cur <- length(vals) + 1
            if (!is.null(n)) {
                n.cur <- n
            }
            if (length(vals) > n.cur) {
                out <- sprintf("%s vector with %d values; set to the value(s): %s, ...", class(data[, cn])[1], 
                  length(vals), paste(vals[1:n.cur], collapse = ", "))
            } else {
                out <- sprintf("%s vector; set to the value(s): %s.", class(data[, cn])[1], paste(vals, collapse = ", "))
            }
        }
        return(out)
    }
    mysummary <- sapply(colnames(data), function(x) {
        labelColumns(x, data)
    })
    if (print) {
        print_summary(mysummary)
    }
    invisible(mysummary)
}
#' Print a named list of strings, output from \code{\link{summary_data}}.
#' 
#' @export
#' @param sumlist Named list, output of \code{\link{summary_data}}.
#' @param title Optional, text string that will be printed as title.
#' @author Jacolien van Rij
#' @family Utility functions
print_summary <- function(sumlist, title = NULL) {
    if (is.null(title)) {
        cat("Summary:\n")
    } else {
        cat(title, "\n")
    }
    for (x in names(sumlist)) {
        cat("\t*", x, ":", sumlist[[x]], "\n")
    }
}





#' Label timestamps as timebins of a given binsize.
#' 
#' @export
#' @description Function for calculating timebins.
#' @param x Numerical vector with timestamp information.
#' @param binsize Size of the timebin, measured in the same units (often ms) 
#' as \code{x}.
#' @param pos Numerical value that determines the label of the binsize 
#' as proportion of the binsize. A value of 0 will provide the minimum 
#' timestamp within the bin as label, a value of 1 will provide the maximum 
#' value within the bin as label. Defaults to 0.5, the center of the bin.
#' @return Anumerical vector of the same size as \code{x} with timebin 
#' information.
#' @author Jacolien van Rij
#' @examples
#' data(simdat)
#' # grouping Time values in bins:
#' simdat$Timebin <- timeBins(simdat$Time, 200)
#' head(simdat)
#' 
#' # different labels:
#' simdat$Timebin2 <- timeBins(simdat$Time, 200, pos=0)
#' head(simdat)
#' @family Utility functions
timeBins <- function(x, binsize, pos = 0.5) {
    return((floor(x/binsize) + pos) * binsize)
}

Try the itsadug package in your browser

Any scripts or data that you put into this service are public.

itsadug documentation built on June 17, 2022, 5:05 p.m.