## Author: David Carslaw
## useful utility functions
## with some updates and modifications by Karl Ropkins
###############################################################################
startYear <- function(dat) as.numeric(format(min(dat[order(dat)]), "%Y"))
endYear <- function(dat) as.numeric(format(max(dat[order(dat)]), "%Y"))
startMonth <- function(dat) as.numeric(format(min(dat[order(dat)]), "%m"))
endMonth <- function(dat) as.numeric(format(max(dat[order(dat)]), "%m"))
## these are pre-defined types that need a field "date"; used by cutData
dateTypes <- c(
"year", "hour", "month", "season", "weekday", "weekend",
"monthyear", "gmtbst", "bstgmt", "dst", "daylight", "week",
"seasonyear", "yearseason"
)
## sets up how openair graphics look by default and resets on exit
setGraphics <- function(fontsize = 5) {
current.strip <- trellis.par.get("strip.background")
trellis.par.set(fontsize = list(text = fontsize))
## reset graphic parameters
font.orig <- trellis.par.get("fontsize")$text
on.exit(trellis.par.set(
fontsize = list(text = font.orig)
))
}
# function to test whether a suggested package is available and stop if not
try_require <- function(package, fun) {
if (requireNamespace(package, quietly = TRUE)) {
library(package, character.only = TRUE)
return(invisible())
}
stop(
"Package `", package, "` required for `", fun, "`.\n",
"Please install and try again.",
call. = FALSE
)
}
###############################################################################
## function to find averaging period of data, returns "xx sec"
## for use in filling in gaps in time series data
## it finds the table of values of time gaps and picks the biggest
## can't think of a better way unless the user specifies what the time interval is meant to be
find.time.interval <- function(dates) {
## could have several sites, dates may be unordered
## find the most common time gap in all the data
dates <- unique(dates) ## make sure they are unique
# work out the most common time gap of unique, ordered dates
id <- which.max(table(diff(as.numeric(unique(dates[order(dates)])))))
seconds <- as.numeric(names(id))
if ("POSIXt" %in% class(dates)) seconds <- paste(seconds, "sec")
if (class(dates)[1] == "Date") {
seconds <- seconds * 3600 * 24
seconds <- paste(seconds, "sec")
}
seconds
}
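## Illustrative usage (commented out so nothing runs at package load);
## for hourly POSIXct data the most common gap is 3600 seconds:
##   x <- as.POSIXct("2020-01-01", tz = "GMT") + 3600 * c(0:3, 6, 8)
##   find.time.interval(x)  ## "3600 sec"
##   find.time.interval(as.Date("2020-01-01") + c(0:5, 9))  ## "86400 sec"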
date.pad2 <- function(mydata, type = NULL, interval = "month") {
# assume by the time we get here the data have been split into types
# This means we just need to pad out the missing types based on first
# line.
start.date <- min(mydata$date, na.rm = TRUE)
end.date <- max(mydata$date, na.rm = TRUE)
# interval is in seconds, so convert to days if Date class and not POSIXct
if (class(mydata$date)[1] == "Date")
interval <- paste(as.numeric(strsplit(interval, " ")[[1]][1]) / 3600 / 24, "days")
all.dates <- data.frame(date = seq(start.date, end.date, by = interval))
mydata <- mydata %>% full_join(all.dates, by = "date")
# add in missing types if gaps are made
if (!is.null(type)) {
mydata[type] <- mydata[1, type]
}
# make sure order is correct
mydata <- arrange(mydata, date)
return(mydata)
}
## #################################################################
# Function to pad out missing time data
# assumes data have already been split by type, so just take first
# tries to work out time interval of input based on most common gap
# can print assumed gap to screen
date.pad <- function(mydata, type = NULL, print.int = FALSE) {
# if one line, just return
if (nrow(mydata) < 2) return(mydata)
## time zone of data
TZ <- attr(mydata$date, "tzone")
if (is.null(TZ)) TZ <- "GMT" ## as it is on Windows for BST
## function to fill missing data gaps
## assume no missing data to begin with
## pad out missing data
start.date <- min(mydata$date, na.rm = TRUE)
end.date <- max(mydata$date, na.rm = TRUE)
## interval in seconds
interval <- find.time.interval(mydata$date)
## equivalent number of days, used to refine interval for month/year
days <- as.numeric(strsplit(interval, split = " ")[[1]][1]) /
24 / 3600
## find time interval of data
if (class(mydata$date)[1] == "Date") {
interval <- paste(days, "day")
} else {
## this will be in seconds
interval <- find.time.interval(mydata$date)
}
## better interval, most common interval in a year
if (days == 31) interval <- "month"
if (days %in% c(365, 366)) interval <- "year"
## only pad if there are missing data
if (length(unique(diff(mydata$date))) != 1L) {
all.dates <- data.frame(date = seq(start.date, end.date, by = interval))
mydata <- mydata %>% full_join(all.dates, by = "date")
# add missing types - if type is present
if (!is.null(type)) {
mydata[type] <- mydata[1, type]
}
}
## return the same TZ that we started with
attr(mydata$date, "tzone") <- TZ
if (print.int) message("Input data time interval assumed is ", interval)
# make sure date-sorted
mydata <- arrange(mydata, date)
mydata
}
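## Illustrative usage of date.pad (commented out so nothing runs at
## package load); a two-hour gap is filled with NA rows and the "site"
## column is padded from the first row:
##   hourly <- data.frame(
##     date = as.POSIXct("2020-01-01", tz = "GMT") + 3600 * c(0, 1, 4, 5),
##     nox = c(10, 12, 9, 11), site = "A"
##   )
##   date.pad(hourly, type = "site", print.int = TRUE)  ## 6 rows, NA at 02:00/03:00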
#############################################################################################
## utility function to convert decimal dates to POSIXct
decimalDate <- function(x, date = "date") {
thedata <- x
x <- x[, date]
x.year <- floor(x)
## fraction of the year
x.frac <- x - x.year
## number of seconds in each year
x.sec.yr <- unclass(ISOdate(x.year + 1, 1, 1, 0, 0, 0)) - unclass(ISOdate(x.year, 1, 1, 0, 0, 0))
## now get the actual time
x.actual <- ISOdate(x.year, 1, 1, 0, 0, 0) + x.frac * x.sec.yr
x.actual <- as.POSIXct(trunc(x.actual, "hours"), "GMT")
thedata$date <- x.actual
thedata
}
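## Illustrative usage (commented): decimal years become POSIXct; 2020.5
## falls mid-year (2 July in a leap year):
##   decimalDate(data.frame(date = c(2020.0, 2020.5)))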
#' Calculate rollingMean values
#'
#' Calculate rollingMean values taking account of data capture thresholds
#'
#' This is a utility function mostly designed to calculate rolling
#' mean statistics relevant to some pollutant limits e.g. 8 hour
#' rolling means for ozone and 24 hour rolling means for
#' PM10. However, the function has a more general use in helping to
#' display rolling mean values in flexible ways e.g. with the rolling
#' window width left, right or centre aligned.
#'
#' The function will try to fill in missing time gaps to get a full
#' time sequence but will return a data frame with the same number of
#' rows as supplied.
#'
#' @param mydata A data frame containing a \code{date}
#' field. \code{mydata} must contain a \code{date} field in
#' \code{Date} or \code{POSIXct} format. The input time series must
#' be regular e.g. hourly, daily.
#' @param pollutant The name of a pollutant e.g. \code{pollutant = "o3"}.
#' @param width The averaging period (rolling window width) to use
#' e.g. \code{width = 8} will generate 8-hour rolling mean values
#' when hourly data are analysed.
#' @param new.name The name given to the new rollingMean variable. If
#' not supplied it will create a name based on the name of the
#' pollutant and the averaging period used.
#' @param data.thresh The data capture threshold in \%. No values are
#' calculated if data capture over the period of interest is less
#' than this value. For example, with \code{width = 8} and
#' \code{data.thresh = 75} at least 6 hours are required to calculate
#' the mean, else \code{NA} is returned.
#' @param align specifies how the moving window should be
#' aligned. \code{"right"} means that the previous \code{width} hours
#' (including the current) are averaged; this seems to be the
#' convention for UK air quality rolling mean statistics. \code{"left"}
#' means that the following \code{width} hours are averaged.
#' \code{"centre"} or \code{"center"}, the default, centres the window
#' on the current hour.
#' @param ... other arguments, currently unused.
#' @export
#' @author David Carslaw
#' @examples
#'
#' ## rolling 8-hour mean for ozone
#' mydata <- rollingMean(mydata, pollutant = "o3", width = 8, new.name =
#' "rollingo3", data.thresh = 75, align = "right")
#'
#'
rollingMean <- function(mydata, pollutant = "o3", width = 8, new.name = "rolling",
data.thresh = 75, align = "centre", ...) {
## function to calculate rolling means
## uses C++ code
## get rid of R check annoyances
site <- NULL
if (!align %in% c("left", "right", "centre", "center")) stop("align should be one of 'right', 'left', 'centre' or 'center'.")
if (missing(new.name)) new.name <- paste("rolling", width, pollutant, sep = "")
if (data.thresh < 0 | data.thresh > 100) stop("Data threshold must be between 0 and 100.")
calc.rolling <- function(mydata, ...) {
## data needs to be numeric
if (!is.numeric(mydata[[pollutant]])) {
warning("Data are not numeric.")
return(mydata)
}
## need to know whether dates added
dates <- mydata$date
## pad missing hours
mydata <- date.pad(mydata)
## make sure function is not called with window width longer than data
if (width > nrow(mydata)) return(mydata)
mydata[[new.name]] <- .Call(
"rollMean", mydata[[pollutant]],
width, data.thresh, align,
PACKAGE = "openair"
)
if (length(dates) != nrow(mydata)) {
## return what was put in
## avoids adding missing data e.g. for factors
mydata <- mydata[mydata$date %in% dates, ]
}
mydata
}
## split if several sites
if ("site" %in% names(mydata)) { ## split by site
mydata <- group_by(mydata, site) %>%
do(calc.rolling(., ...))
mydata
} else {
mydata <- calc.rolling(mydata, ...)
mydata
}
}
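## Illustrative comparison of the 'align' options (commented example,
## assuming regular hourly input); with width = 3, "right" averages the
## current and two previous hours, "left" the current and two following:
##   df <- data.frame(
##     date = as.POSIXct("2020-01-01", tz = "GMT") + 3600 * 0:5,
##     o3 = 1:6
##   )
##   rollingMean(df, pollutant = "o3", width = 3, align = "right",
##               new.name = "roll3", data.thresh = 0)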
convert.date <- function(mydata, format = "%d/%m/%Y %H:%M") {
mydata$date <- as.POSIXct(strptime(mydata$date, format = format), "GMT")
mydata
}
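## Illustrative usage (commented): character dates in dd/mm/yyyy hh:mm
## format are parsed as GMT POSIXct:
##   convert.date(data.frame(date = "01/04/2000 12:00"))$date
##   ## "2000-04-01 12:00:00 GMT"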
#############################################################################################
## splits data frame into date chunks. Allows users to supply simple dates and labels
## useful for type = "site", interventions
#' Divide up a data frame by time
#'
#' Utility function to prepare input data for use in openair functions
#'
#' This function partitions a data frame into different time segments. It
#' produces a new column, whose name is controlled by \code{name}, that can
#' be used in many \code{openair} functions. Note that there must be one more
#' label than there are dates. See examples below and in the full
#' \code{openair} documentation.
#'
#' @param mydata A data frame containing a \code{date} field in hourly or high
#' resolution format.
#' @param dates A date or dates to split data by.
#' @param labels Labels for each time partition.
#' @param name The name to give the new column used to identify the split periods.
#' @export
#' @author David Carslaw
#' @examples
#'
#' ## split data up into "before" and "after"
#' mydata <- splitByDate(mydata, dates = "1/04/2000",
#' labels = c("before", "after"))
#'
#' ## split data into 3 partitions:
#' mydata <- splitByDate(mydata, dates = c("1/1/2000", "1/3/2003"),
#' labels = c("before", "during", "after"))
#'
#'
splitByDate <- function(mydata, dates = "1/1/2003", labels = c("before", "after"), name = "split.by") {
## if date in format dd/mm/yyyy hh:mm (basic check)
if (missing(mydata)) stop("No data frame was supplied!")
mydata <- checkPrep(mydata, names(mydata), "default", remove.calm = FALSE)
## check there are sufficient labels for number of dates
if (length(dates) != length(labels) - 1) {
stop("There is a mis-match between dates and labels. There should be one more label than date.")
}
if (length(grep("/", as.character(dates))) > 0) {
if (class(mydata$date)[1] == "Date") {
dates <- as_date(as.POSIXct(strptime(dates, "%d/%m/%Y"), "GMT"))
} else {
dates <- as.POSIXct(strptime(dates, "%d/%m/%Y"), "GMT")
}
} else { ## assume format yyyy-mm-dd
if (class(mydata$date)[1] == "Date") {
dates <- as_date(dates)
} else {
dates <- as.POSIXct(dates, "GMT")
}
}
mydata[, name] <- cut(
as.numeric(mydata$date),
breaks = c(
0, as.numeric(dates),
max(mydata$date)
), labels = labels,
ordered_result = TRUE
)
mydata
}
#############################################################################################
## function to make it easy to use d/m/y format for subsetting by date
#' Subset a data frame based on date
#'
#' Utility function to make it easier to select periods from a data frame
#' before sending to a function
#'
#' This function makes it much easier to select periods of interest from a data
#' frame based on dates in a British format. Selecting date/times in R format
#' can be intimidating for new users. This function can be used to select quite
#' complex dates simply - see examples below.
#'
#' Dates are assumed to be inclusive, so \code{start = "1/1/1999"} means that
#' times are selected from hour zero. Similarly, \code{end = "31/12/1999"} will
#' include all hours of the 31st December. \code{start} and \code{end} can also
#' be in standard R format as a string i.e. "YYYY-mm-dd", so \code{start =
#' "1999-01-01"} is fine.
#'
#' All options are applied in turn, making it possible to select quite
#' complex dates.
#'
#' @param mydata A data frame containing a \code{date} field in hourly or high
#' resolution format.
#' @param start A start date string in the form d/m/yyyy e.g. \dQuote{1/2/1999}
#' or in \sQuote{R} format i.e. \dQuote{YYYY-mm-dd}, \dQuote{1999-02-01}
#' @param end See \code{start} for format.
#' @param year A year or years to select e.g. \code{year = 1998:2004} to select
#' 1998-2004 inclusive or \code{year = c(1998, 2004)} to select 1998 and
#' 2004.
#' @param month A month or months to select. Can either be numeric e.g.
#' \code{month = 1:6} to select months 1-6 (January to June), or by name e.g.
#' \code{month = c("January", "December")}. Names can be abbreviated to 3
#' letters and be in lower or upper case.
#' @param day A day name or days to select. \code{day} can be numeric (1 to
#' 31) or character. For example \code{day = c("Monday", "Wednesday")} or
#' \code{day = 1:10} (to select the 1st to 10th of each month). Names can be
#' abbreviated to 3 letters and be in lower or upper case. Also accepts
#' \dQuote{weekday} (Monday - Friday) and \dQuote{weekend} for convenience.
#' @param hour An hour or hours to select from 0-23 e.g. \code{hour = 0:12} to
#' select hours 0 to 12 inclusive.
#' @export
#' @author David Carslaw
#' @examples
#'
#' ## select all of 1999
#' data.1999 <- selectByDate(mydata, start = "1/1/1999", end = "31/12/1999")
#' head(data.1999)
#' tail(data.1999)
#'
#' # or...
#' data.1999 <- selectByDate(mydata, start = "1999-01-01", end = "1999-12-31")
#'
#' # easier way
#' data.1999 <- selectByDate(mydata, year = 1999)
#'
#'
#' # more complex use: select weekdays between the hours of 7 am to 7 pm
#' sub.data <- selectByDate(mydata, day = "weekday", hour = 7:19)
#'
#' # select weekends between the hours of 7 am to 7 pm in winter (Dec, Jan, Feb)
#' sub.data <- selectByDate(mydata, day = "weekend", hour = 7:19, month =
#' c("dec", "jan", "feb"))
#'
selectByDate <- function(mydata, start = "1/1/2008",
end = "31/12/2008", year = 2008,
month = 1, day = "weekday", hour = 1) {
## extract variables of interest
vars <- names(mydata)
## check data - mostly date format
mydata <- checkPrep(
mydata, vars, "default",
remove.calm = FALSE,
strip.white = FALSE
)
weekday.names <- format(ISOdate(2000, 1, 3:9), "%A")
if (!missing(start)) {
## assume R date format
start <- as_date(parse_date_time(start, c("ymd", "dmy")))
mydata <- subset(mydata, as_date(date) >= start)
}
if (!missing(end)) {
## assume R date format
end <- as_date(parse_date_time(end, c("ymd", "dmy")))
mydata <- subset(mydata, as_date(date) <= end)
}
if (!missing(year)) {
mydata <- mydata[which(year(mydata$date) %in% year), ]
}
if (!missing(month)) {
if (is.numeric(month)) {
if (any(month < 1 | month > 12)) {
stop("Month must be between 1 to 12.")
}
mydata <- mydata[which(month(mydata$date) %in% month), ]
} else {
mydata <- subset(mydata, substr(tolower(format(
date,
"%B"
)), 1, 3) %in% substr(tolower(month), 1, 3))
}
}
if (!missing(hour)) {
if (any(hour < 0 | hour > 23)) stop("Hour must be between 0 and 23.")
mydata <- mydata[which(hour(mydata$date) %in% hour), ]
}
if (!missing(day)) {
days <- day
if (is.numeric(day)) {
if (any(day < 1 | day > 31)) {
stop("Day must be between 1 to 31.")
}
mydata <- mydata[which(day(mydata$date) %in% day), ]
} else {
if (day[1] == "weekday") {
days <- weekday.names[1:5]
}
if (day[1] == "weekend") {
days <- weekday.names[6:7]
}
mydata <- subset(mydata, substr(tolower(format(date, "%A")), 1, 3) %in%
substr(tolower(days), 1, 3))
}
}
mydata
}
## from Deepayan Sarkar
panel.smooth.spline <-
function(x, y,
w = NULL, df, spar = NULL, cv = FALSE,
lwd = lwd, lty = plot.line$lty, col, col.line = plot.line$col,
type, horizontal = FALSE, all.knots = TRUE, ...) {
x <- as.numeric(x)
y <- as.numeric(y)
ok <- is.finite(x) & is.finite(y)
if (sum(ok) < 1) {
return()
}
if (!missing(col)) {
if (missing(col.line)) {
col.line <- col
}
}
plot.line <- trellis.par.get("plot.line")
if (horizontal) {
spline <-
smooth.spline(
y[ok], x[ok],
w = w, df = df, spar = spar, cv = cv
)
panel.lines(
x = spline$y, y = spline$x, col = col.line,
lty = lty, lwd = lwd, ...
)
}
else {
spline <-
smooth.spline(
x[ok], y[ok],
w = w, df = df, spar = spar, cv = cv
)
panel.lines(
x = spline$x, y = spline$y, col = col.line,
lty = lty, lwd = lwd, ...
)
}
}
### panel functions for plots based on lattice ####################################################
panel.gam <- function(x, y, form = y ~ x, method = "loess", k = k, Args, ..., simulate = FALSE, n.sim = 200,
autocor = FALSE, se = TRUE,
level = 0.95, n = 100, col = plot.line$col, col.se = col,
lty = plot.line$lty, lwd = plot.line$lwd, alpha = plot.line$alpha,
alpha.se = 0.20, border = NA, subscripts, group.number, group.value,
type, col.line, col.symbol, fill, pch, cex, font, fontface,
fontfamily) {
## panel function to add a smooth line to a plot
## Uses a GAM (mgcv) to fit smooth
## Optionally can plot 95% confidence intervals and run bootstrap simulations
## to estimate uncertainties. Simple block bootstrap is also available for correlated data
## get rid of R check annoyances
plot.line <- NULL
thedata <- data.frame(x = x, y = y)
thedata <- na.omit(thedata)
tryCatch({
if (!simulate) {
if (is.null(k)) {
mod <- suppressWarnings(gam(y ~ s(x), select = TRUE, data = thedata, ...))
} else {
mod <- suppressWarnings(gam(y ~ s(x, k = k), select = TRUE, data = thedata, ...))
}
lims <- current.panel.limits()
xrange <- c(max(min(lims$x), min(x, na.rm = TRUE)), min(max(lims$x), max(x, na.rm = TRUE)))
xseq <- seq(xrange[1], xrange[2], length = n)
## for uncertainties
std <- qnorm(level / 2 + 0.5)
pred <- predict(mod, data.frame(x = xseq), se = TRUE)
panel.lines(xseq, pred$fit, col = col, alpha = alpha, lty = lty, lwd = 2)
results <- data.frame(
date = xseq, pred = pred$fit,
lower = pred$fit - std * pred$se,
upper = pred$fit + std * pred$se
)
if (se) {
panel.polygon(
x = c(xseq, rev(xseq)), y = c(pred$fit -
std * pred$se, rev(pred$fit + std * pred$se)),
col = col.se, alpha = alpha.se, border = border
)
pred <- pred$fit
}
} else { ## simulations required
x <- thedata$x
y <- thedata$y
sam.size <- length(x)
lims <- current.panel.limits()
xrange <- c(max(min(lims$x), min(x)), min(max(lims$x), max(x)))
xseq <- seq(xrange[1], xrange[2], length = sam.size)
boot.pred <- matrix(nrow = sam.size, ncol = n.sim)
message("Taking bootstrap samples. Please wait...")
## set up bootstrap
block.length <- 1
if (autocor) block.length <- round(sam.size ^ (1 / 3))
index <- samp.boot.block(sam.size, n.sim, block.length)
## predict first
if (is.null(k)) {
mod <- gam(y ~ s(x), data = thedata, ...)
} else {
mod <- gam(y ~ s(x, k = k), data = thedata, ...)
}
residuals <- residuals(mod) ## residuals of the model
pred.input <- predict(mod, thedata)
for (i in 1:n.sim) {
## make new data
new.data <- data.frame(x = xseq, y = pred.input + residuals[index[, i]])
mod <- gam(y ~ s(x), data = new.data, ...)
pred <- predict(mod, new.data)
boot.pred[, i] <- as.vector(pred)
}
## calculate percentiles
percentiles <- apply(boot.pred, 1, function(x) quantile(x, probs = c(0.025, 0.975)))
results <- as.data.frame(cbind(
pred = rowMeans(boot.pred),
lower = percentiles[1, ], upper = percentiles[2, ]
))
if (se) {
panel.polygon(
x = c(xseq, rev(xseq)), y = c(results$lower, rev(results$upper)),
col = col.se, alpha = alpha.se, border = border
)
}
panel.lines(xseq, pred.input, col = col, alpha = alpha, lty = lty, lwd = 2)
}
results
}, error = function(x) return())
}
## version of GAM fitting not for plotting - need to rationalise both...
fitGam <- function(thedata, x = "date", y = "conc", form = y ~ x, k = k,
Args, ..., simulate = FALSE, n.sim = 200, autocor = FALSE, se = TRUE,
level = 0.95, n = 100) {
## fits a GAM (mgcv) smooth to the data - not a panel function
## Optionally can calculate 95% confidence intervals and run bootstrap simulations
## to estimate uncertainties. Simple block bootstrap is also available for correlated data
data.orig <- thedata ## return this if all else fails
id <- which(names(thedata) == x)
names(thedata)[id] <- "x"
id <- which(names(thedata) == y)
names(thedata)[id] <- "y"
# can only fit numeric, so convert back after fitting
class_x <- class(thedata$x)
thedata$x <- as.numeric(thedata$x)
tryCatch({
if (!simulate) {
if (is.null(k)) {
mod <- suppressWarnings(gam(y ~ s(x), select = TRUE, data = thedata, ...))
} else {
mod <- suppressWarnings(gam(y ~ s(x, k = k), select = TRUE, data = thedata, ...))
}
xseq <- seq(min(thedata$x, na.rm = TRUE), max(thedata$x, na.rm = TRUE), length = n)
## for uncertainties
std <- qnorm(level / 2 + 0.5)
pred <- predict(mod, data.frame(x = xseq), se = se)
results <- data.frame(
date = xseq, pred = pred$fit,
lower = pred$fit - std * pred$se,
upper = pred$fit + std * pred$se
)
} else { ## simulations required
sam.size <- nrow(thedata)
xseq <- seq(min(thedata$x, na.rm = TRUE), max(thedata$x, na.rm = TRUE), length = n)
boot.pred <- matrix(nrow = sam.size, ncol = n.sim)
message("Taking bootstrap samples. Please wait...")
## set up bootstrap
block.length <- 1
if (autocor) block.length <- round(sam.size ^ (1 / 3))
index <- samp.boot.block(sam.size, n.sim, block.length)
## predict first
if (is.null(k)) {
mod <- gam(y ~ s(x), data = thedata, ...)
} else {
mod <- gam(y ~ s(x, k = k), data = thedata, ...)
}
residuals <- residuals(mod) ## residuals of the model
pred.input <- predict(mod, thedata)
for (i in 1:n.sim) {
## make new data
new.data <- data.frame(x = xseq, y = pred.input + residuals[index[, i]])
mod <- gam(y ~ s(x), data = new.data, ...)
pred <- predict(mod, new.data)
boot.pred[, i] <- as.vector(pred)
}
## calculate percentiles
percentiles <- apply(boot.pred, 1, function(x) quantile(x, probs = c(0.025, 0.975)))
results <- as.data.frame(cbind(
pred = rowMeans(boot.pred),
lower = percentiles[1, ], upper = percentiles[2, ]
))
}
# convert class back to original
class(results[[x]]) <- class_x
return(results)
}, error = function(x) {
data.orig
})
}
#########################################################################################################
## error in mean from Hmisc
errorInMean <- function(x, mult = qt((1 + conf.int) / 2, n - 1), conf.int = 0.95,
na.rm = TRUE) {
if (na.rm) {
x <- x[!is.na(x)]
}
n <- length(x)
if (n < 2) {
return(c(Mean = mean(x), Lower = NA, Upper = NA))
}
xbar <- sum(x) / n
se <- sqrt(sum((x - xbar) ^ 2) / n / (n - 1))
c(Mean = xbar, Lower = xbar - mult * se, Upper = xbar + mult *
se)
}
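## Illustrative usage (commented): the default gives a 95% t-based
## interval in the mean, equivalent to t.test(x)$conf.int:
##   errorInMean(c(10, 12, 9, 11, 13))  ## Mean = 11 plus Lower/Upper bounds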
## bootstrap confidence intervals in the mean from Hmisc
bootMean <- function(x, conf.int = 0.95, B = 1000, ...) {
x <- x[!is.na(x)] # remove missings
n <- length(x)
xbar <- mean(x)
if (n < 2) {
return(c(Mean = xbar, Lower = NA, Upper = NA))
}
z <- unlist(lapply(1:B, function(i, x, N)
sum(x[(sample.int(N, N, TRUE, NULL))]), x = x, N = n)) / n
quant <- quantile(z, c((1 - conf.int) / 2, (1 + conf.int) / 2))
names(quant) <- NULL
res <- c(Mean = xbar, Lower = quant[1], Upper = quant[2])
res
}
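## Illustrative usage (commented): the interval is based on B resampled
## means, so set a seed for reproducible output:
##   set.seed(42)
##   bootMean(rnorm(100, mean = 10))  ## Mean with 2.5% and 97.5% quantiles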
#' Bootstrap confidence intervals in the mean
#'
#' A utility function to calculate the uncertainty intervals in the mean of a
#' vector. The function removes any missing data before the calculation.
#'
#' @param x A vector from which the mean and bootstrap confidence intervals in
#' the mean are to be calculated
#' @param conf.int The confidence interval; default = 0.95.
#' @param B The number of bootstrap simulations
#'
#' @return Returns a data frame with the mean, lower uncertainty, upper
#' uncertainty and number of values used in the calculation
#' @export
#'
#' @examples
#' test <- rnorm(20, mean = 10)
#' bootMeanDF(test)
bootMeanDF <- function(x, conf.int = 0.95, B = 1000) {
if (!is.vector(x)) {
stop("x should be a vector.")
}
res <- bootMean(x = x, conf.int = conf.int, B = B)
res <- data.frame(mean = res[1], min = res[2], max = res[3], n = length(na.omit(x)))
res
}
bootMeanDiff <- function(mydata, x = "x", y = "y", conf.int = 0.95, B = 1000, na.rm = FALSE) {
## calculates bootstrap mean differences
## assumes y - x
x.name <- x
y.name <- y
x <- na.omit(mydata[[x]])
y <- na.omit(mydata[[y]])
Mean <- mean(y) - mean(x)
if (nrow(mydata) < 2 | is.na(Mean)) {
res1 <- data.frame(variable = x.name, Mean = mean(x), Lower = NA, Upper = NA,
stringsAsFactors = FALSE)
res2 <- data.frame(variable = y.name, Mean = mean(y), Lower = NA, Upper = NA,
stringsAsFactors = FALSE)
res <- data.frame(variable = paste(y.name, "-", x.name), Mean = Mean,
Lower = NA, Upper = NA,
stringsAsFactors = FALSE)
res <- bind_rows(res1, res2, res)
res$variable <- factor(res$variable)
return(res)
}
x <- bootMean(x, B = B)
y <- bootMean(y, B = B)
quant1 <- quantile(x, c((1 - conf.int) / 2, (1 + conf.int) / 2), na.rm = na.rm)
quant2 <- quantile(y, c((1 - conf.int) / 2, (1 + conf.int) / 2), na.rm = na.rm)
quant <- quantile(y - x, c((1 - conf.int) / 2, (1 + conf.int) / 2), na.rm = na.rm)
names(quant1) <- NULL
names(quant2) <- NULL
names(quant) <- NULL
res1 <- data.frame(variable = x.name, Mean = mean(x),
Lower = quant1[1], Upper = quant1[2],
stringsAsFactors = FALSE)
res2 <- data.frame(variable = y.name, Mean = mean(y), Lower = quant2[1],
Upper = quant2[2],
stringsAsFactors = FALSE)
res <- data.frame(variable = paste(y.name, "-", x.name),
Mean = Mean, Lower = quant[1], Upper = quant[2],
stringsAsFactors = FALSE)
res <- bind_rows(res1, res2, res)
res$variable <- factor(res$variable)
res
}
###########################################################################################################
## list update function
## for lattice type object structure and ... handling
## (currently used by)
## (all openair plots that include colorkey controlled by drawOpenKey)
## listUpdate function
# [in development]
listUpdate <- function(a, b, drop.dots = TRUE,
subset.a = NULL, subset.b = NULL) {
if (drop.dots) {
a <- a[names(a) != "..."]
b <- b[names(b) != "..."]
}
if (!is.null(subset.a)) {
a <- a[names(a) %in% subset.a]
}
if (!is.null(subset.b)) {
b <- b[names(b) %in% subset.b]
}
if (length(names(b)) > 0) {
a <- modifyList(a, b)
}
a
}
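## Illustrative usage (commented): entries in 'b' override those in 'a'
## and any "..." elements are dropped first:
##   a <- list(col = "red", lwd = 1)
##   b <- list(lwd = 2, lty = 3)
##   listUpdate(a, b)  ## list(col = "red", lwd = 2, lty = 3)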
#############################################################################################################
## makeOpenKeyLegend v0.1
## common code for making legend list
## objects for use with drawOpenKey outputs
## uses listUpdate in utilities
makeOpenKeyLegend <- function(key, default.key, fun.name = "function") {
# handle logicals and lists
if (is.logical(key)) {
legend <- if (key) default.key else NULL
} else if (is.list(key)) {
legend <- listUpdate(default.key, key)
} else {
if (!is.null(key)) {
warning(
paste(
"In ", fun.name, "(...):\n unrecognised key not exported/applied\n",
" [see ?drawOpenKey for key structure/options]",
sep = ""
),
call. = FALSE
)
}
legend <- NULL
}
# structure like legend for drawOpenKey
if (!is.null(legend)) {
legend <- list(right = list(
fun = drawOpenKey, args = list(key = legend),
draw = FALSE
))
if ("space" %in% names(legend$right$args$key)) {
names(legend)[[1]] <- legend$right$args$key$space
}
}
legend
}
## polygon that can deal with missing data for use in lattice plots with groups
poly.na <- function(x1, y1, x2, y2, group.number, myColors, alpha = 0.4, border = NA) {
for (i in seq(2, length(x1)))
if (!any(is.na(y2[c(i - 1, i)]))) {
lpolygon(
c(x1[i - 1], x1[i], x2[i], x2[i - 1]),
c(y1[i - 1], y1[i], y2[i], y2[i - 1]),
col = myColors[group.number], border = border, alpha = alpha
)
}
}
## gives names of lattice strips
strip.fun <- function(results.grid, type, auto.text) {
## proper names of labelling ###################################################
pol.name <- sapply(
levels(factor(results.grid[[type[1]]])),
function(x) quickText(x, auto.text)
)
strip <- strip.custom(factor.levels = pol.name)
if (length(type) == 1) {
strip.left <- FALSE
} else { ## two conditioning variables
pol.name <- sapply(
levels(factor(results.grid[[type[2]]])),
function(x) quickText(x, auto.text)
)
strip.left <- strip.custom(factor.levels = pol.name)
}
if (length(type) == 1 & type[1] == "default") strip <- FALSE ## remove strip
list(strip, strip.left, pol.name)
}
## from lattice
chooseFace <- function(fontface = NULL, font = 1) {
if (is.null(fontface)) {
font
} else {
fontface
}
}
## .smoothScatterCalcDensity() is also in graphics, but not exported.
.smoothScatterCalcDensity <- function(x, nbin, bandwidth, range.x) {
if (!("KernSmooth" %in% loadedNamespaces())) {
ns <- try(loadNamespace("KernSmooth"))
if (isNamespace(ns)) {
message("(loaded the KernSmooth namespace)")
} else {
stop("panel.smoothScatter() requires the KernSmooth package, but unable to load KernSmooth namespace")
}
}
if (length(nbin) == 1) {
nbin <- c(nbin, nbin)
}
if (!is.numeric(nbin) || (length(nbin) != 2)) stop("'nbin' must be numeric of length 1 or 2")
if (missing(bandwidth)) {
bandwidth <- diff(apply(x, 2, quantile, probs = c(0.05, 0.95), na.rm = TRUE)) / 25
} else {
if (!is.numeric(bandwidth)) stop("'bandwidth' must be numeric")
}
bandwidth[bandwidth == 0] <- 1
## create density map
if (missing(range.x)) {
rv <- KernSmooth::bkde2D(x, gridsize = nbin, bandwidth = bandwidth)
} else {
rv <- KernSmooth::bkde2D(x, gridsize = nbin, bandwidth = bandwidth, range.x = range.x)
}
rv$bandwidth <- bandwidth
return(rv)
}
## simple rounding function from plyr
round_any <- function(x, accuracy, f = round) {
f(x / accuracy) * accuracy
}
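## Illustrative usage (commented): round to an arbitrary accuracy; 'f'
## can be round, floor or ceiling:
##   round_any(1.37, 0.25)           ## 1.25
##   round_any(1.37, 0.25, ceiling)  ## 1.50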
## pretty gap calculator
prettyGap <- function(x, n = 100) {
return(diff(pretty(x, n))[1])
}
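## Illustrative usage (commented): the spacing of pretty axis breaks
## over the range of x:
##   prettyGap(c(0, 10), n = 4)  ## 2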
# function to check that variables are numeric; if not, coerce with a warning
checkNum <- function(mydata, vars) {
for (i in seq_along(vars)) {
if (!is.numeric(mydata[[vars[i]]])) {
mydata[[vars[i]]] <- as.numeric(as.character(mydata[[vars[i]]]))
warning(
paste(vars[i], "is not numeric, forcing to numeric..."),
call. = FALSE
)
}
}
return(mydata)
}
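## Illustrative usage (commented): character columns are coerced to
## numeric with a warning; non-numeric strings become NA:
##   df <- data.frame(ws = c("3.4", "5.1", "n/a"), stringsAsFactors = FALSE)
##   checkNum(df, vars = "ws")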
#' Bin data, calculate mean and bootstrap 95\% confidence interval in the mean
#'
#' Bin a variable and calculate the mean and uncertainties in the mean
#'
#' This function summarises data by intervals and calculates the mean and
#' bootstrap 95\% confidence intervals in the mean of a chosen variable in a
#' data frame. Any other numeric variables are summarised by their mean for
#' each interval.
#'
#' There are three options for binning. The default is to split \code{bin}
#' into 40 intervals. Second, the user can choose a binning interval e.g.
#' \code{interval = 5}. Third, the user can supply their own breaks to use as
#' binning intervals.
#'
#' @param mydata Name of the data frame to process.
#' @param bin The name of the column to divide into intervals
#' @param uncer The name of the column for which the mean, lower and upper
#' uncertainties should be calculated for each interval of \code{bin}.
#' @param n The number of intervals to split \code{bin} into.
#' @param interval The interval to be used for binning the data.
#' @param breaks User specified breaks to use for binning.
#'
#' @return Returns a summarised data frame with new columns for the mean and
#' upper / lower 95 percent confidence intervals in the mean.
#' @export
#'
#' @examples
#' # how does nox vary by intervals of wind speed?
#' results <- binData(mydata, bin = "ws", uncer = "nox")
#'
#' # easy to plot this using ggplot2
#' \dontrun{
#' library(ggplot2)
#' ggplot(results, aes(ws, mean, ymin = min, ymax = max)) +
#' geom_pointrange()
#' }
binData <- function(mydata, bin = "nox", uncer = "no2", n = 40, interval = NA,
breaks = NA) {
if (!is.na(interval)) {
mydata$interval <- cut(
mydata[[bin]], sort(unique(round_any(mydata[[bin]], interval))),
include.lowest = TRUE
)
} else if (!anyNA(breaks)) {
mydata$interval <- cut(mydata[[bin]], breaks = breaks, include.lowest = TRUE)
} else {
mydata$interval <- cut(mydata[[bin]], breaks = n)
}
# remove any missing intervals
id <- which(is.na(mydata$interval))
if (length(id) > 0)
mydata <- mydata[-id, ]
# calculate 95% CI in mean
uncert <- group_by(mydata, interval) %>%
do(bootMeanDF(.[[uncer]], B = 250))
mydata <- group_by(mydata, interval) %>%
summarise_if(is.numeric, mean, na.rm = TRUE)
mydata <- inner_join(mydata, uncert, by = "interval")
mydata
}
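## Illustrative usage of the other two binning options (commented,
## assuming the openair 'mydata' example data set):
##   binData(mydata, bin = "ws", uncer = "nox", interval = 2)
##   binData(mydata, bin = "ws", uncer = "nox", breaks = c(0, 2, 4, 6, 10, 20))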