# To do:
# - iccf.core: bring approx() interpolation outside main loop (speed)
# - add more test data, unit tests
# - use ACF(0) to normalise CCF(0) to ensure ACF(0)=1...?
# -----------------------------------------------------------
#' Estimate cross correlation of unevenly sampled time series.
#'
#' \code{cross_correlate} returns cross correlation data for two times series.
#'
#' Function for estimating the cross-correlation between two time series which
#' may be irregularly and/or non-simultaneously sampled. The CCF is computed
#' using one of two methods: (1) the Discrete Correlation Function (DCF; Edelson
#' & Krolik 1988) or (2) the Interpolated Cross Correlation Function (ICCF;
#' Gaskell & Sparke 1986). You can also produce estimates of uncertainty on the
#' CCF, its peak and centroid using the Flux Randomisation and Random Subsample
#' Selection (FR/RSS) method of Peterson et al. (1998).
#'
#' @param ts.1,ts.2 (array or dataframe) data for time series 1 and 2.
#' @param method (string) use \code{"dcf"} or \code{"iccf"} (default).
#' @param max.lag (float) maximum lag at which to compute the CCF.
#' @param min.pts (integer) each DCF bin must contain at least \code{min.pts} correlation coefficients.
#' @param dtau (float) spacing of the time delays (\code{tau}) at which the CCF is estimated.
#' @param local.est (logical) use 'local' (not 'global') means and variances?
#' @param zero.clip (logical) remove pairs of points with exactly zero lag?
#' @param use.errors (logical) if \code{TRUE} then subtract mean square error from variances.
#' @param one.way (logical) (ICCF only) if TRUE then only interpolate time series 2.
#' @param cov (logical) if \code{TRUE} then compute covariance, not correlation coefficient.
#' @param prob (float) probability level to use for confidence intervals.
#' @param nsim (integer) number of FR/RSS simulations to run
#' @param peak.frac (float) only include CCF points above \code{peak.frac}*max(ccf) in centroid calculation.
#' @param chatter (integer) set the level of feedback.
#'
#' @return
#' A list with components
#' \item{tau}{(array) A one dimensional array containing the lags at which the CCF is estimated.}
#' \item{ccf}{(array) An array with the same dimensions as lag containing the estimated CCF.}
#' \item{lower}{(array) Lower limit of CCF (see Notes).}
#' \item{upper}{(array) Upper limit of CCF (see Notes).}
#' \item{peak.dist}{(array) An array of length \code{nsim} containing the CCF peaks from the simulations.}
#' \item{cent.dist}{(array) An array of length \code{nsim} containing the CCF centroids from the simulations.}
#' \item{method}{(string) which method was used? \code{"iccf"} or \code{"dcf"}}
#' The value of \code{ccf[k]} returns the estimated correlation between
#' \code{ts.1$y}(t+tau) and \code{ts.2$y}(t) where tau = \code{tau[k]}. A
#' strong peak at negative lags indicates that \code{ts.1} leads \code{ts.2}.
#'
#' @section Notes:
#' If only one time series is given as input then the Auto-Correlation Function
#' (ACF) is computed.
#'
#' Input data frames: note that the input data \code{ts.1} and \code{ts.2} are
#' not traditional \code{R} time series objects. Such objects are only suitable
#' for regularly sampled data. These CCF functions are designed to work with
#' data of arbitrary sampling, we therefore need to explicitly list times and
#' values. The input objects are therefore data.frames with at least two columns
#' which must be called \code{t} (time) and \code{y} (value). An error of the
#' value may be provided by a \code{dy} column. Any other columns are ignored.
#'
#' Local vs. global estimation: If \code{local.est = FALSE} (default) then the
#' correlation coefficient is computed using the 'global' mean and variance of
#' each time series. If \code{local.est = TRUE} then the correlation coefficient
#' is computed using the 'local' mean and variance. For each lag, the mean to be
#' subtracted and the variance to be divided are computed using only data points
#' contributing to that lag.
#'
#' Simulations: Performs "flux randomisation" and "random sample selection" of
#' an input time series, following Peterson et al. (2004, ApJ, 613:682-699).
#' Given an input data series \code{(t, y, dy)} of length \code{N} we sample
#' \code{N} points with replacement. Duplicated points are ignored, so the
#' output is usually shorter than the input. So far this is a basic bootstrap
#' procedure.
#'
#' If error bars are provided: when a point is selected \code{m} times, we
#' decrease the error by \code{1/sqrt(m)}. See Appendix A of Peterson et al. And
#' after resampling in time, we then add a random Gaussian deviate to each
#' remaining data point, with std.dev equal to its error bar. In this way both
#' the times and values are randomised. If errors bars are not provided, this is
#' a simple bootstrap.
#'
#' Peak and centroid: from the simulations we record the CCF peak and its
#' centroid. The centroid is the mean of \code{tau*ccf/sum(ccf)} including all
#' points for which \code{ccf} is higher than \code{peak.frac} of
#' \code{max(ccf)}.
#'
#' Upper/lower limits and distributions: If no simulations are used (\code{nsim
#' = 0}) then upper/lower confidence limits on the CCF are estimated using the
#' method of Bartlett (1955) based on the two ACFs. If simulations are used, the
#' confidence limits are based on the simulations.
#'
#' @seealso \code{\link[stats]{ccf}}, \code{\link{fr_rss}}
#'
#' @examples
#' ## Example using NGC 5548 data
#' result <- cross_correlate(cont, hbeta, dtau = 1, max.lag = 550)
#' plot(result$tau, result$ccf, type = "l", bty = "n", xlab = "time delay", ylab = "CCF")
#' grid()
#'
#' ## or using the DCF method
#' result <- cross_correlate(cont, hbeta, method = "dcf", dtau = 5, max.lag = 350)
#' lines(result$tau, result$ccf, col = "red")
#'
#' ## Examples from Venables & Ripley
#' require(graphics)
#' tsf <- data.frame(t = time(fdeaths), y = fdeaths)
#' tsm <- data.frame(t = time(mdeaths), y = mdeaths)
#'
#' ## compute CCF using ICCF method
#' result <- cross_correlate(tsm, tsf, method = "iccf")
#' plot_ccf(result)
#'
#' ## compute CCF using standard method (stats package) and compare
#' result.st <- ccf(mdeaths, fdeaths, plot = FALSE)
#' lines(result.st$lag, result.st$acf, col="red", lwd = 3)
#'
#' @export
cross_correlate <- function(ts.1, ts.2,
                            method = "iccf",
                            max.lag = NULL,
                            min.pts = 5,
                            dtau = NULL,
                            local.est = FALSE,
                            zero.clip = NULL,
                            use.errors = FALSE,
                            one.way = FALSE,
                            cov = FALSE,
                            prob = 0.1,
                            nsim = 0,
                            peak.frac = 0.8,
                            chatter = 0) {

  # check arguments
  if (missing(ts.1)) stop('Missing ts.1 data frame.')

  # if only one time series is given, duplicate it: we then compute the ACF
  acf.flag <- FALSE
  if (missing(ts.2)) {
    acf.flag <- TRUE
    ts.2 <- ts.1
  }

  # each input data frame must provide (at least) columns t and y
  if (!exists("t", where = ts.1)) stop('Missing column t in ts.1.')
  if (!exists("y", where = ts.1)) stop('Missing column y in ts.1.')
  if (!exists("t", where = ts.2)) stop('Missing column t in ts.2.')
  if (!exists("y", where = ts.2)) stop('Missing column y in ts.2.')

  # strip out bad data (any NA, NaN or Inf values)
  goodmask.1 <- is.finite(ts.1$t) & is.finite(ts.1$y)
  ts.1 <- ts.1[goodmask.1, ]
  t.1 <- ts.1$t
  n.1 <- length(t.1)
  goodmask.2 <- is.finite(ts.2$t) & is.finite(ts.2$y)
  ts.2 <- ts.2[goodmask.2, ]
  t.2 <- ts.2$t
  n.2 <- length(t.2)

  # stop if there's too little data to bother proceeding
  if (n.1 <= 5) stop('ts.1 is too short.')
  if (n.2 <= 5) stop('ts.2 is too short.')

  # if max.lag is not set, default to 1/4 of the full (max-min) time range
  if (is.null(max.lag))
    max.lag <- (max(c(t.1, t.2)) - min(c(t.1, t.2))) * 0.25

  # if dtau is not set, default to the smallest sampling interval of ts.1
  if (is.null(dtau))
    dtau <- min( diff(t.1) )

  # total number of lag bins; make sure it's odd so that tau is centred on zero
  n.tau <- ceiling(2 * max.lag / dtau)
  if (n.tau %% 2 == 0)
    n.tau <- n.tau + 1
  lag.bins <- round( (n.tau - 1) / 2 )
  if (lag.bins < 4) stop('You have too few lag bins.')

  # now adjust max.lag so that -max.lag to +max.lag spans an odd number of
  # dtau bins
  max.lag <- (1/2) * dtau * (n.tau - 1)

  # define the vector of lag bins. tau is the centre of each bin.
  # extends from -max.lag to +max.lag, centred on zero.
  tau <- seq(-max.lag, max.lag, by = dtau)

  # optional feedback for the user
  if (chatter > 0) {
    cat('-- lag.bins:', lag.bins, fill = TRUE)
    cat('-- max.lag:', max.lag, fill = TRUE)
    cat('-- n.tau:', n.tau, fill = TRUE)
    cat('-- length(tau)', length(tau), fill = TRUE)
    cat('-- dtau:', dtau, fill = TRUE)
    cat('-- length ts.1:', n.1, ' dt:', diff(t.1[1:2]), fill = TRUE)
    cat('-- length ts.2:', n.2, ' dt:', diff(t.2[1:2]), fill = TRUE)
  }

  # compute the CCF for the input data using the requested method
  # (bug fix: use.errors was previously accepted but never forwarded)
  ccf.out <- NA
  if (method == "dcf") {
    ccf.out <- dcf(ts.1, ts.2, tau,
                   local.est = local.est, min.pts = min.pts,
                   zero.clip = zero.clip, chatter = chatter, cov = cov,
                   use.errors = use.errors)
  } else {
    ccf.out <- iccf(ts.1, ts.2, tau,
                    local.est = local.est, chatter = chatter,
                    cov = cov, one.way = one.way, zero.clip = zero.clip)
  }

  # extract the (possibly adjusted) lag settings from the CCF output
  max.lag <- max(ccf.out$tau)
  nlag <- length(ccf.out$tau)
  lag.bins <- (nlag - 1) / 2
  dtau <- diff(ccf.out$tau[1:2])
  if (chatter > 0) {
    cat('-- max.lag: ', max.lag, fill = TRUE)
    cat('-- nlag: ', nlag, fill = TRUE)
    cat('-- lag.bins: ', lag.bins, fill = TRUE)
    cat('-- dtau: ', dtau, fill = TRUE)
  }

  # set up blanks in case no simulations are run
  lower <- NA
  upper <- NA
  cent.dist <- NA
  peak.dist <- NA

  # (optional) run FR/RSS simulations to compute errors
  if (nsim > 0) {
    # check we have enough simulations to estimate the requested quantiles
    if (nsim < 2 / max(prob))
      stop('Not enough simulations. Make nsim or prob larger.')
    sims <- ccf_errors(ts.1, ts.2, tau, nsim = nsim,
                       method = method, peak.frac = peak.frac, min.pts = min.pts,
                       local.est = local.est, zero.clip = zero.clip, prob = prob,
                       cov = cov, chatter = chatter, acf.flag = acf.flag,
                       one.way = one.way, use.errors = use.errors)
    lower <- sims$lags$lower
    upper <- sims$lags$upper
    cent.dist <- sims$dists$cent.lag
    # bug fix: this was assigned to 'peak.disk', leaving peak.dist always NA
    peak.dist <- sims$dists$peak.lag
  } else {
    # no simulations: estimate confidence limits via Bartlett's formula,
    # which needs the ACF of each input series
    if (method == "dcf") {
      acf.1 <- dcf(ts.1, ts.1, tau,
                   local.est = local.est,
                   min.pts = min.pts,
                   chatter = chatter, cov = cov,
                   use.errors = use.errors)
      acf.2 <- dcf(ts.2, ts.2, tau,
                   local.est = local.est,
                   min.pts = min.pts,
                   chatter = chatter, cov = cov,
                   use.errors = use.errors)
    } else {
      acf.1 <- iccf(ts.1, ts.1, tau,
                    local.est = local.est,
                    chatter = chatter,
                    cov = cov,
                    one.way = one.way)
      acf.2 <- iccf(ts.2, ts.2, tau,
                    local.est = local.est,
                    chatter = chatter,
                    cov = cov,
                    one.way = one.way)
    }
    sigma <- sqrt( (1 / ccf.out$n) * sum(acf.1$ccf * acf.2$ccf) )
    lower <- ccf.out$ccf - sigma
    upper <- ccf.out$ccf + sigma
  }

  # return output
  result <- list(tau = ccf.out$tau,
                 ccf = ccf.out$ccf,
                 lower = lower,
                 upper = upper,
                 peak.dist = peak.dist,
                 cent.dist = cent.dist,
                 method = method)
  return(result)
}
# -----------------------------------------------------------
#' Plot CCF output from cross_correlate
#'
#' \code{plot_ccf} generates a summary plot of CCF data.
#'
#' @param ccf A list containing \code{tau} and \code{ccf} values as produced by \code{cross_correlate}.
#' @param ... Any other arguments for the \code{plot()} function.
#'
#' @return None.
#'
#' @seealso \code{\link{cross_correlate}}
#'
#' @export
plot_ccf <- function(ccf, ...) {
  # Summary plot of CCF output: blue curve plus a semi-transparent pink
  # confidence band. DCF results are drawn as a step plot; ICCF as a line.
  lag.max <- max(ccf$tau)
  bin.width <- diff(ccf$tau[1:2])

  # helper to draw the CCF curve in the style appropriate to the method
  draw_curve <- function() {
    if (ccf$method == "dcf") {
      lines(ccf$tau - bin.width / 2, ccf$ccf, type = "s", lwd = 2, col = "blue")
    } else {
      lines(ccf$tau, ccf$ccf, col = "blue", lwd = 3)
    }
  }

  # empty plotting region spanning the full (symmetric) lag range
  plot(0, 0, type = "n", lwd = 2, bty = "n", ylim = c(-1, 1),
       xlim = c(-1, 1) * lag.max, xlab = "lag", ylab = "CCF", ...)
  grid(col = "lightgrey")
  draw_curve()

  # shade the confidence band, skipping any non-finite CCF values
  pnk <- rgb(255, 192, 203, 100, maxColorValue = 255)
  ok <- is.finite(ccf$ccf)
  polygon(c(ccf$tau[ok], rev(ccf$tau[ok])),
          c(ccf$lower[ok], rev(ccf$upper[ok])),
          col = pnk, border = NA)

  # redraw the curve so it sits on top of the band
  draw_curve()
}
# -----------------------------------------------------------
# fr_rss
# History:
# 21/03/16 - First working version
#
# (c) Simon Vaughan, University of Leicester
# -----------------------------------------------------------
#' Perform flux randomisation/random subset section on input data.
#'
#' \code{fr_rss} returns a randomised version of an input data array.
#'
#' Performs "flux randomisation" and "random sample selection"
#' of an input time series, following Peterson et al. (2004, ApJ, v613, pp682-699).
#' This is essentially a bootstrap for a data vector.
#'
#' @param dat (data frame) containing columns \code{t, y} and (optionally)
#' \code{dy}.
#'
#' @return
#' A data frame containing columns
#' \item{t}{time bins for randomised data}
#' \item{y}{values for randomised data}
#' \item{dy}{errors for randomised data}
#'
#' @section Notes:
#' Given an input data series (\code{t, y, dy}) of length \code{N} we sample
#' \code{N} points with replacement. Duplicated points are ignored, so the
#' output is usually shorter than the input. So far this is a basic bootstrap
#' procedure.
#'
#' If error bars are provided: when a point is selected \code{m} times, we
#' decrease the error, scaling by \code{1/sqrt(m)}. See Appendix A of Peterson
#' et al. After resampling, we then add a random Gaussian deviate to each
#' remaining data point, with std.dev equal to its (new) error bar. If errors
#' bars are not provided, this is a simple bootstrap (no randomisation of
#' \code{y}).
#'
#' @seealso \code{\link{cross_correlate}}, \code{\link{ccf_errors}}
#'
#' @examples
#' ## Example using the NGC 5548 data
#' plot(cont$t, cont$y, type="l", bty = "n", xlim = c(50500, 52000))
#' rcont <- fr_rss(cont)
#' lines(rcont$t, rcont$y, col = "red")
#'
#' ## Examples from Venables & Ripley
#' require(graphics)
#' plot(fdeaths, bty = "n")
#' tsf <- data.frame(t = time(fdeaths), y = fdeaths)
#' rtsf <- fr_rss(tsf)
#' lines(rtsf$t, rtsf$y, col="red", type="o")
#'
#' @export
fr_rss <- function(dat) {
  # check arguments
  if (missing(dat)) stop('Missing DAT argument')

  # extract data
  times <- dat$t
  x <- dat$y
  n <- length(times)

  # if no errors are provided, we will do a simple bootstrap
  if (exists("dy", where = dat)) {
    bootstrap <- FALSE
    dx <- dat$dy
  } else {
    bootstrap <- TRUE
    dx <- 0
  }
  # recycle a scalar error value to the full data length
  if (length(dx) < n) dx <- rep(dx[1], n)

  # ignore any data with NA, NaN or Inf values
  mask <- is.finite(x) & is.finite(dx)
  times <- times[mask]   # time
  x <- x[mask]           # value
  dx <- dx[mask]         # error
  n <- length(x)
  if (n == 0) stop('No finite data points in DAT.')

  # randomly sample n points with replacement
  indx <- sample.int(n, n, replace = TRUE)

  # identify which points are sampled more than once
  duplicates <- duplicated(indx)
  indx.clean <- indx[!duplicates]

  # where a data point is selected m > 0 times, scale its error by 1/sqrt(m)
  # (tabulate counts occurrences of each index 1..n; this replaces the
  # previous hist(indx, breaks = 0:n + 0.5)$counts idiom)
  n.repeats <- tabulate(indx, nbins = n)
  dx <- dx / sqrt( pmax.int(1, n.repeats) )

  # new data are free from duplicates, and have errors decreased where
  # points were selected multiple times
  times.new <- times[indx.clean]
  x.new <- x[indx.clean]
  dx.new <- dx[indx.clean]
  n <- length(x.new)

  # now randomise the fluxes at each data point (only when errors are known)
  if (bootstrap == FALSE) {
    x.new <- rnorm(n, mean = x.new, sd = dx.new)
  }

  # sort into time order
  indx <- order(times.new)

  # return the output data
  return(data.frame(t = times.new[indx], y = x.new[indx], dy = dx.new[indx]))
}
# -----------------------------------------------------------
# ccf_errors
# History:
# 21/03/16 - v1.0 - First working version
# 05/04/16 - v1.1 - added na.rm option to strip out non-finite values
# 09/04/16 - v1.2 - added use.errors option; minor fixes
# 23/07/16 - v1.3 - minor change to handling of centroid calculation.
# if the CCF is entirely <0 then return NA for
# centroid.
#
# (c) Simon Vaughan, University of Leicester
# -----------------------------------------------------------
#' Use random simulations to estimate uncertainty on CCF estimates.
#'
#' \code{ccf_errors} returns information on the uncertainty on CCF estimates.
#'
#' Computes errors on the CCF estimates using "flux randomisation"
#' and "random subset sampling" FR/RSS using the \code{fr_rss} function.
#'
#' @param tau (array) list of lags at which the CCF is to be evaluated.
#' @param acf.flag (logical) \code{TRUE} when computing ACF,
#' and \code{ts.2 = ts.1}
#' @inheritParams cross_correlate
#'
#' @return
#' The output is a list containing two data frames: \code{lags} and \code{dists}.
#' \item{lags}{a data frame with three columns}
#' \item{tau}{time lags}
#' \item{lower}{the lower limit of the confidence interval}
#' \item{upper}{the upper limit of the confidence interval}
#' \item{dists}{a data frame with two columns}
#' \item{peak.lag}{the peak values from nsim simulations}
#' \item{cent.lag}{the centroid values from nsim simulations}
#'
#' @section Notes:
#' For each randomised pair of light curves we compute the CCF. We record the
#' CCF, the lag at the peak, and the centroid lag (including only points higher
#' than \code{peak.frac * max(ccf)}). Using \code{nsim} simulations we compute
#' the \code{(1-p)*100\%} confidence intervals on the CCF values, and the
#' distribution of the peaks and centroids.
#'
#' @seealso \code{\link{cross_correlate}}, \code{\link{fr_rss}}
#'
#' @export
ccf_errors <- function(ts.1, ts.2,
                       tau = NULL,
                       min.pts = 5,
                       local.est = FALSE,
                       cov = FALSE,
                       prob = 0.1,
                       nsim = 250,
                       peak.frac = 0.8,
                       zero.clip = NULL,
                       one.way = FALSE,
                       method = "iccf",
                       use.errors = FALSE,
                       acf.flag = FALSE,
                       chatter = 0) {

  # check arguments
  if (missing(ts.1)) stop('Missing ts.1 data frame.')
  if (is.null(tau)) stop('Missing tau input.')

  # if only one time series then duplicate it (compute the ACF).
  # bug fix: previously acf.flag was unconditionally reset to FALSE here,
  # so the acf.flag argument passed by the caller was always ignored.
  if (missing(ts.2)) {
    acf.flag <- TRUE
    ts.2 <- ts.1
  }
  if (peak.frac > 1 || peak.frac < 0)
    stop('peak.frac should be in the range 0-1')
  if (prob > 1 || prob < 0)
    stop('prob should be in the range 0-1')

  nlag <- length(tau)

  # set up arrays for the simulated CCFs, peaks and centroids
  ccf.sim <- array(NA, dim = c(nsim, nlag))
  peak.lag <- array(NA, dim = nsim)
  cent.lag <- array(NA, dim = nsim)

  # loop over simulations
  for (i in seq_len(nsim)) {
    # generate randomised data; when computing an ACF both series must be
    # the same realisation
    ts1.sim <- fr_rss(ts.1)
    if (acf.flag == FALSE) {
      ts2.sim <- fr_rss(ts.2)
    } else {
      ts2.sim <- ts1.sim
    }

    # compute CCF of randomised data
    if (method == "dcf") {
      result.sim <- dcf(ts1.sim, ts2.sim, tau,
                        local.est = local.est,
                        min.pts = min.pts,
                        cov = cov,
                        zero.clip = zero.clip,
                        use.errors = use.errors)
    } else {
      result.sim <- iccf(ts1.sim, ts2.sim, tau,
                         zero.clip = zero.clip,
                         local.est = local.est,
                         cov = cov,
                         one.way = one.way)
    }
    ccf.sim[i, ] <- result.sim$ccf

    # find and store the lag of the CCF peak
    peak <- which.max(result.sim$ccf)
    peak.lag[i] <- result.sim$tau[peak]

    # find and store the centroid of the CCF over points >= peak.frac * max;
    # if the CCF peak is < 0 then leave the centroid as NA
    if (max(result.sim$ccf, na.rm = TRUE) > 0) {
      mask <- which( result.sim$ccf >= peak.frac * max(result.sim$ccf, na.rm = TRUE) )
      cent.lag[i] <- sum(result.sim$ccf[mask] * result.sim$tau[mask], na.rm = TRUE) /
        sum(result.sim$ccf[mask], na.rm = TRUE)
    }
    if (chatter > 0) cat("\r Processed", i, "of", nsim, "simulations")
  }
  if (chatter > 0) cat(fill = TRUE)

  # now compute the prob/2 and 1-prob/2 quantiles at each lag
  ccf.lims <- array(NA, dim = c(nlag, 2))
  probs <- c(prob / 2, 1 - prob / 2)
  for (i in seq_len(nlag)) {
    ccf.lims[i, ] <- quantile(ccf.sim[, i], probs = probs, na.rm = TRUE)
  }

  # package the CCF limits: lag, lower and upper limits
  lags <- data.frame(tau = result.sim$tau, lower = ccf.lims[, 1], upper = ccf.lims[, 2])

  # package the peak and centroid distributions
  dists <- data.frame(peak.lag = peak.lag, cent.lag = cent.lag)

  # report the confidence interval of the centroid
  if (chatter > 0)
    cat('-- ', signif(100 * (1 - prob), 3), '% lag interval ',
        quantile(cent.lag, probs[1], na.rm = TRUE), ' - ',
        quantile(cent.lag, probs[2], na.rm = TRUE), fill = TRUE, sep = "")

  # return data back to calling function
  return(list(lags = lags, dists = dists))
}
# -----------------------------------------------------------
# (The two lines below are website-embed boilerplate accidentally captured
# with the source; commented out so the file parses as valid R.)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.