#' @title cor_fun
#' @description Calculates either the cross-correlation function for two variables or the auto-correlation function for a single variable. It can be used to calculate values with bounded periodicity (i.e. the values wrap around'). It uses the population level standard deviation; as such, for self-repeating patterns the ACF does not decay over time.
#' @details To autocorrelate a single variable, just pass x. To cross-correlate two variables, pass both x and y.
#' @param x a vector of numeric values.
#' @param y a vector of numeric values #optional input.
#' @param lag.max the total number of lag times that cross-correlation or autocorrelation will be calculated for.
#' @param wrap determines whether bounded periodicty should be used (wrap=T). Bounded periodicity means that the values 'wrap round'. Usually, not required.
#' @param level determines whether correlation is calculated using the population-level correlation denominator of 'n' or the sample-level correlation demoninator of 'n-1'. Level only accepts values of 'population' or 'sample'. By default it is set to 'sample'.
#' @return cor.out object is returned. There are nine variables that can be accessed: cor, lag, series, x, y, fit_best, wrap, type, level, conf.int.
#' @export
# NOTES
# http://stats.stackexchange.com/questions/254227/manual-calculation-of-acf-of-a-time-series-in-r-close-but-not-quite
#x = df1$x
#y=df1$y
#lag.max=200
#type = 'CCF'
cor_fun = function(x, y=NULL, lag.max=200, wrap=F, level='sample'){
# if y is missing, then make y=x (this shows that the user wants to do autocorrelation)
# automatically generate the 'type' based on the number of input variables
if(missing(y)) {y = x
type = 'ACF'} else{
type = 'CCF'
}
# get the names of the variables
series_x = deparse(substitute(x))
series_y = deparse(substitute(y))
# check variables are correctly formatted
if (!is.numeric(x) || !is.numeric(y))
stop("both 'x' and 'y' must be numeric")
if (length(x) != length(y))
stop("'x' and 'y' must be of equal lengths")
if (length(x) < 15)
stop("there must be at least 15 values in the vectors passed to cor_fun")
# if (type != 'CCF' && type != 'ACF')
# stop("'type' can only take either 'CCF' or 'ACF'. Please note that by default it takes 'CCF'")
if (level != 'population' && level != 'sample')
stop("'level' can only take either 'population' or 'sample'. Please note that by default it takes 'sample'")
# convert x and y to matrixes
x = as.matrix(x)
y = as.matrix(y)
# check vector length
# (I think this code might be redundant now as it is already covered in the code above)
vector_length = as.integer(nrow(x))
nser <- as.integer(ncol(x))
if (is.na(vector_length) || is.na(nser))
stop("the length of the vectors must be greater than 1")
# check the lag.max value is set appropriately
lag.max = as.integer(min(lag.max, (vector_length - 20), (vector_length/ 2)))
#take the smaller value of either number of values in vector (minus 20), the number of values divided by 2 or the lag.max value; so if you put in a huge value it will still only use the correct max number (based on the vector)
if (is.na(lag.max) || lag.max < 4)
stop("'lag.max' must be at least 5.")
# calculate confidence interval (for samples only):
if (level == 'sample'){
lower_bound = cor.test(x,y)$conf.int[1]
upper_bound = cor.test(x,y)$conf.int[2]
conf = data.frame('lower_bound' = lower_bound,
'upper_bound' = upper_bound)
} else {
conf = 'confidence intervals are not reported for populations'
}
## calculate the cross-correlations
# the output is the 'result' df, which contains all of the cross-correlation values
# the 'lag' dg contains the lag timeframes (these correspond to the output in the 'result' df)
# wrap == F
# wrap==F means that there are NO periodic boundary conditions
if (wrap == F){
result_pos = rep(NA, lag.max+1) # add 1 as we start from / account for a lag of 0 in the for loop below
result_neg = rep(NA, lag.max)
if (level == 'population'){
# for population level correlation, use the cor_pop function
for (j in seq(0, lag.max)){
dist = length(y)-j
# result_pos[[j+1]] = cor_pop(x[(1+j):length(x)], y[1:dist])
result_pos[[j+1]] = cor_pop(x[1:dist], y[(1+j):length(y)])}
# for the negative values (e.g. a lag of -1 .. -lag.max), I am lagging the x values against the y -> this makes the most sense to me
# by keeping y unlagged and moving x 'forward', we get the effect of lagging x backwards.
# I foud this hard to get my head round initially
# to test, write out on paper an x variable going from A to E and a y variable going from F to J + manually 'lag' them.
for (j in seq(1, lag.max)){
dist = length(y)-j
result_neg[[j]] = cor_pop(x[(1+j):length(x)], y[1:dist])}
}
if (level == 'sample') {
# for sample level correlation, use the cor_sample function
for (j in seq(0, lag.max)){
dist = length(y)-j
result_pos[[j+1]] = cor_sample(x[1:dist], y[(1+j):length(y)])}
for (j in seq(1, lag.max)){
dist = length(y)-j
result_neg[[j]] = cor_sample(x[(1+j):length(x)], y[1:dist])}
}
## Combine the values for negative and positive lags:
result = c(result_neg, result_pos)
#print(result)
names(result) <- paste0("lag", seq.int(-length(result_neg), lag.max))
result = as.matrix(result)
# create and format the lag matrix:
lag = as.matrix(seq.int(-length(result_neg), lag.max) )
dimnames(lag) = dimnames(result)
## Calculate which lag value produces the best fit:
# First, we need to make sure that we take the right value - for ACF we want the second best value (as a lag of 0 will always be 1, i.e. the best - which we don't want returned as that isn't very interesting)
if (type == 'CCF'){index = 1}
if (type == 'ACF'){index = 2}
# calculate the best correlation
result_df = as.data.frame(result)
cor_value = result_df$V1[order(abs(result_df$V1),decreasing=T)[index]] # uses the abs value -> the value with the greatest magnitude. It should therefore return highly correlated negative values too.
# calculate the best row position (as we now have negative and positive lags, this is not the same as the best lag value)
row_index = order(abs(result_df$V1),decreasing=T)[index]
row_index = as.integer(row_index)
# calculate the best lag
lag_best = row.names(lag)[row_index]
# combine the results
#best = c(lag_best, cor_value, row_lag)
best = data.frame('lag_best' = lag_best,
'cor_value' = cor_value,
'row_index' = row_index)
}
# wrap == T
# if wrap == T it means that there are periodic boundary conditions i.e. the pattern is self-looping.
if (wrap == T){
# create a variable for the periodic boundary length (i.e. the length of the vector)
bound = length(y) #wrap=T doesn't use lag.max as it just uses the full length of y to lag on (so, it is much easier to set this value).
# create an empty df ('result_pos' and 'result_neg') to fill with values:
result_pos = rep(NA, bound) # do NOT add 1 as in this instance, 1000 is the same as 0 (because it is periodic!)
# note that we cannot have negative values for a looping pattern; we just 'go round' back to 1000 for the plotting.
# for 'negative' values, we should just swap the y with the x when we input it.
## Correlate the wrapped around values, depending on whether level is 'sample' or 'population':
if (level == 'population'){
# calculate the positive lagged value
for (j in seq(0, (bound-1))){ #because it starts at 0, we only need to go to bound-1 -> the full range of 'bound' values will still be covered
dist = bound-j
# the formula does not work for j==0 as it adds on extra values, making vectors that are 1001 and 1002 in length
# so, where j==0 we just set x1 to x and y1 to y (that is, fortunately, where there is no lag we can just use the variables as they are; we don't need to 'wrap them round'.)
if (j == 0){
x1 = x
y1 = y
} else{
x1 = x # 'x' just stays stationary and is always the same; it always covers the full range of x values.
#y1 = c(y[1:dist], y[(dist+1): boun values.
y1 = y[c((1+j):bound, 1:j)]} # this 'wraps around', by concatenating the lag values with the earlier values. }
# calculate the correlation values using cor_pop:
result_pos[[j+1]] = cor_pop(x1,y1)}}
if (level == 'sample'){
# calculate the positive lagged value
for (j in seq(0, (bound-1))){ #because it starts at 0, we only need to go to bound-1 -> the full range of 'bound' values will still be covered
dist = bound-j
# the formula does not work for j==0 as it adds on extra values, making vectors that are 1001 and 1002 in length
# so, where j==0 we just set x1 to x and y1 to y (that is, fortunately, where there is no lag we can just use the variables as they are; we don't need to 'wrap them round'.)
if (j == 0){
x1 = x
y1 = y
} else{
x1 = x[1: 1000] # 'x' just stays stationary and is always the same; it always covers the full range of x values.
#y1 = c(y[1:dist], y[(dist+1): boun values.
y1 = y[c((1+j):bound, 1:j)]} # this 'wraps around', by concatenating the lag values with the earlier values. }
# calculate the correlation values using cor_sample:
result_pos[[j+1]] = cor_sample(x1,y1)}}
# end of correlation calculations for both negative and positive values (for either samples or populations)
## Combine the values for negative and positive lags:
# result = c(result_neg, result_pos)
result = result_pos
names(result) <- paste0("lag", seq.int(0, length(result)-1))
result = as.matrix(result)
# create and format the lag matrix:
lag = as.matrix(seq(1,length(result)))
#lag = as.matrix(seq.int(-length(result_neg), length(result_pos)) )
dimnames(lag) = dimnames(result)
## Calculate which lag value produces the best fit:
# It is more efficient to do this before we repeat the values 10 times.
# First, need to make sure that we take the right value - for ACF we want the second best value (as a lag of 0 will always be 1, i.e. the best - which we don't want returned as that isn't very interesting)
if (type == 'CCF'){index = 1}
if (type == 'ACF'){index = 2}
# this code takes the abs value -> the value with the greatest magnitude. It should therefore return highly correlated negative values too.
# calculate the best correlation
result_df = as.data.frame(result)
cor_value = result_df$V1[order(abs(result_df$V1),decreasing=T)[index]]
# calculate the best row position (as we now have negative and positive lags, this is not the same as the best lag value)
row_index = order(abs(result_df$V1),decreasing=T)[index]
row_index = as.integer(row_index)
# calculate the best lag
lag_best = row.names(lag)[row_index]
# combine the results
best = data.frame('lag_best' = lag_best,
'cor_value' = cor_value,
'row_index' = row_index)
## Finally, make 'result' extra long (repeating 10 times) so multiple plots of the pattern can be produced
result = as.matrix( rep(result,10) )
}
# end of main code: for both sample=T and population=T the correlation coefficients for lags have been calculated
## Final code
# create some additional variables, which can then be passed to the other functions in the package
# wrap
if (wrap == T){ wrap1 = 'wrapped' }
if (wrap == F){ wrap1 = 'no wrap' }
# type
# No need - can just pass the 'type' object directly to the output.
# prepare the final output
cor.out <- structure(list(cor = result,
lag = lag,
fit_best = best,
x = x,
y = y,
wrap = wrap1,
type = type,
level = level,
conf.int = conf),
class = "cor.out")
return(cor.out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.