In Faskally/rtemp: functions to compare river data

Loch Ard River Temperature:

Functional Data Analysis approach

Session set up

# load required libraries
library(lattice)
library(fda)
library(reshape)

Loading in data

# get full data set
load("B:/Loch_Ard_Felling/Package/rivertemp/data/fullwater.rda")
# a quick fix
stream $ year[stream $ Sensor.Model == "Squirrel"] <- stream $ year[stream $ Sensor.Model == "Squirrel"] + 1900 
head(stream)

investigate daily temperature patterns for burn 2

# select a few days data and add in decimal hour
dat <- with(subset(stream, site == 2 & year == 2009), 
    data.frame(temp = Original.Value, 
               dhour = hour + min/60, 
               month = factor(month.abb[mon + 1], levels = month.abb), 
               yday = yday, 
               year = year, 
               site = site))
head(dat)

xyplot(temp ~ dhour | month, groups = yday, data = dat, type = "l", as.table = TRUE)

Lets build some daily temperature curves. The idea is to use functional data anaysis techniques to summarise the daily data into a few statistics using functional PCA. These daily summaries will then be used construct yearly curves for further modelling. It may be nesisary to remove some days as outliers before proceeding to the modelling phase.

The next bit of code builds daily temperature curves selecting the roughness penalty using GCV. A different roughness penalty is chosen for each month

first job though is to set up the smoothing bases and a penalty

# set up a bspline basis of dimension 48 and order 6 
hourbasis <- create.bspline.basis(c(0, 24), 48, 6)
# and a harmonic accelerator penalty... penalises deviations from sinusiodal curves
penalty <- vec2Lfd(c(0, (2*pi/24)^2, 0), c(0, 24))

# loop over days and fit curves

getgcv <- function(lambda, .yday, .year = 2009, .site = 2) {
  sdat <- subset(dat, yday == .yday & year == .year & site == .site)
  if (nrow(sdat) < 23) return (NA)
  smooth.basis(sdat $ dhour, sdat $ temp, fdPar(hourbasis, penalty, lambda)) $ gcv
}

getmonthlygcv <- function(lambda, .month, .year = 2009, .site = 2) {
  ydays <- unique(subset(dat, month == .month) $ yday)
  sum(sapply(ydays, getgcv, lambda = lambda, .year = .year, .site = .site), na.rm = TRUE)
}

optimfn <- function(par, .month) getmonthlygcv(exp(par), .month = .month)

lambdas <- data.frame(month = factor(month.abb, levels = month.abb))
lambdas $ loglambdas <- sapply(lambdas $ month, function(.month) optimise(optimfn, interval = c(-15, 15), .month = .month) $ minimum)
lambdas

The smoothing parameters look fairly constant in time so lets use just one.

getcoef <- function(lambda, .yday, .year = 2009, .site = 2) {
  sdat <- subset(dat, yday == .yday & year == .year & site == .site)
  if (nrow(sdat) < 23) return (NULL)
  c(coef(smooth.basis(sdat $ dhour, sdat $ temp, fdPar(hourbasis, penalty, lambda))))
}

ck <- sapply(0:365, getcoef, lambda = exp(-8), .year = 2009, .site = 2)
ck <- simplify2array(ck[!sapply(ck, is.null)])

tempfd <- fd(ck, hourbasis)

capt <- plot(tempfd, col = rainbow(365), lty = 1)

Now we have smoothed daily curves. What do we want to do with these... We could look at where the biggest variability is within the day?

hour <- seq(0, 24, length = 100)
varmat <- eval.bifd(hour, hour, var.fd(tempfd))
image(hour, hour, varmat, col = heat.colors(100))
contour(hour, hour, varmat, add = TRUE)

So overall the most variability is between 3pm and 6pm. But what about at different points in the year. The following plot looks at the variance curves for each month to see if there is a change in where the variability lies.

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat, month == x) $ yday))
stdmat <- sapply(ydays, function(i) eval.fd(hour, std.fd(tempfd[i])))
stddat <- expand.grid(hour = hour, month = month.abb)
stddat $ std <- c(stdmat)
xyplot(std^2 ~ hour | month, data = stddat, type = "l", as.table = TRUE, grid = TRUE)

lets to a quick PCA to see what the major components of variation are

pc <- pca.fd(tempfd, nharm = 4)
par(mfrow = c(2,2))
plot(pc)

and the harmonics are

plot(pc $ harmonics)

The first harmonic is effectively the daily mean temperature. It would perhaps be a good idea to remove this variation simply by centering the data around the daily means, and then examining the remaining variations.

basis1 <- create.constant.basis(c(0, 24))

getcoef <- function(.yday, .year = 2009, .site = 2) {
  sdat <- subset(dat, yday == .yday & year == .year & site == .site)
  if (nrow(sdat) < 23) return (NULL)
  c(coef(smooth.basis(sdat $ dhour, sdat $ temp, basis1)))
}

ck <- sapply(0:365, getcoef, .year = 2009, .site = 2)
ck <- simplify2array(ck[!sapply(ck, is.null)])

dailymeanfd <- fd(matrix(ck, 1), basis1)

So now we have a functional data object of all the daily means. We can substract this from the daily temperature curves and investigate the remaining variation

centeredfd <- tempfd - dailymeanfd
plot(centeredfd, col = rainbow(365), lty = 1)

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat, month == x) $ yday))
stdmat <- eval.fd(hour, centeredfd)
stddat <- expand.grid(hour = hour, yday = unique(dat $ yday))
stddat $ std <- c(stdmat)
stddat $ month <- sapply(stddat $ yday, function(x) which(sapply(ydays, function(y) x %in% y)))
stddat $ month <- factor(month.abb[stddat $ month], levels = month.abb)
xyplot(std ~ hour | month, groups = yday, data = stddat, type = "l", as.table = TRUE, grid = TRUE)

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat, month == x) $ yday))
stdmat <- sapply(ydays, function(i) eval.fd(hour, std.fd(centeredfd[i])))
stddat <- expand.grid(hour = hour, month = month.abb)
stddat $ std <- c(stdmat)
xyplot(std^2 ~ hour | month, data = stddat, type = "l", as.table = TRUE, grid = TRUE)

lets redo the PCA to see what the major components of variation now are

pc2 <- pca.fd(centeredfd, nharm = 4)
par(mfrow = c(2,2))
plot(pc2)

and the harmonics are

plot(pc2 $ harmonics)

investigate daily temperature patterns for burn 10

# select a few days data and add in decimal hour
dat10 <- with(subset(stream, site == 10 & year == 2009), 
    data.frame(temp = Original.Value, 
               dhour = hour + min/60, 
               month = factor(month.abb[mon + 1], levels = month.abb), 
               yday = yday, 
               year = year, 
               site = site))
head(dat10)

xyplot(temp ~ dhour | month, groups = yday, data = dat10, type = "l", as.table = TRUE)

# loop over days and fit curves

getgcv <- function(lambda, .yday, .year = 2009, .site = 10) {
  sdat <- subset(dat10, yday == .yday & year == .year & site == .site)
  if (nrow(sdat) < 23) return (NA)
  smooth.basis(sdat $ dhour, sdat $ temp, fdPar(hourbasis, penalty, lambda)) $ gcv
}

getmonthlygcv <- function(lambda, .month, .year = 2009, .site = 10) {
  ydays <- unique(subset(dat10, month == .month) $ yday)
  sum(sapply(ydays, getgcv, lambda = lambda, .year = .year, .site = .site), na.rm = TRUE)
}

optimfn <- function(par, .month) getmonthlygcv(exp(par), .month = .month)

lambdas <- data.frame(month = factor(month.abb, levels = month.abb))
lambdas $ loglambdas <- sapply(lambdas $ month, function(.month) optimise(optimfn, interval = c(-15, 15), .month = .month) $ minimum)
lambdas

The smoothing parameters look fairly constant in time so lets use just one.

getcoef <- function(lambda, .yday, .year = 2009, .site = 10) {
  sdat <- subset(dat10, yday == .yday & year == .year & site == .site)
  if (nrow(sdat) < 23) return (NULL)
  c(coef(smooth.basis(sdat $ dhour, sdat $ temp, fdPar(hourbasis, penalty, lambda))))
}

ck <- sapply(0:365, getcoef, lambda = exp(-8), .year = 2009, .site = 10)
ck <- simplify2array(ck[!sapply(ck, is.null)])

tempfd10 <- fd(ck, hourbasis)

capt <- plot(tempfd10, col = rainbow(365), lty = 1)

basis1 <- create.constant.basis(c(0, 24))

getcoef <- function(.yday, .year = 2009, .site = 10) {
  sdat <- subset(dat10, yday == .yday & year == .year & site == .site)
  if (nrow(sdat) < 23) return (NULL)
  c(coef(smooth.basis(sdat $ dhour, sdat $ temp, basis1)))
}

ck <- sapply(0:365, getcoef)
ck <- simplify2array(ck[!sapply(ck, is.null)])

dailymeanfd10 <- fd(matrix(ck, 1), basis1)

So now we have a functional data object of all the daily means. We can substract this from the daily temperature curves and investigate the remaining variation

centeredfd10 <- tempfd10 - dailymeanfd10
plot(centeredfd10, col = rainbow(365), lty = 1)

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
stdmat <- eval.fd(hour, centeredfd10)
stddat <- expand.grid(hour = hour, yday = unique(dat10 $ yday))
stddat $ month <- sapply(stddat $ yday, function(x) which(sapply(ydays, function(y) x %in% y)))
stddat $ month <- factor(month.abb[stddat $ month], levels = month.abb)
stddat $ std <- c(stdmat)
xyplot(std ~ hour | month, groups = yday, data = stddat, type = "l", as.table = TRUE, grid = TRUE)

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
stdmat <- sapply(ydays, function(i) eval.fd(hour, std.fd(centeredfd10[i])))
stddat <- expand.grid(hour = hour, month = month.abb)
stddat $ std <- c(stdmat)
xyplot(std^2 ~ hour | month, data = stddat, type = "l", as.table = TRUE, grid = TRUE)

lets redo the PCA to see what the major components of variation now are

pc10.2 <- pca.fd(centeredfd10, nharm = 4)
par(mfrow = c(2,2))
plot(pc10.2)

and the harmonics are

plot(pc10.2 $ harmonics)

what about cross correlations?

lets remind ourselves of the within site covariances

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(tempfd[i])))
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE, main = "Burn 2")

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(tempfd10[i])))
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE, main = "Burn 10")

The same scaled

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(tempfd[i])))
scl <- apply(abs(varmat), 2, max)
varmat <- sweep(varmat, 2, scl, "/")
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE, main = "Burn 2")

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(tempfd10[i])))
scl <- apply(abs(varmat), 2, max)
varmat <- sweep(varmat, 2, scl, "/")
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE, main = "Burn 10")

Now with the daily mean removed

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(centeredfd[i])))
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE, main = "Burn 2")

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(centeredfd10[i])))
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE, main = "Burn 10")

The same scaled

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(centeredfd[i])))
scl <- apply(abs(varmat), 2, max)
varmat <- sweep(varmat, 2, scl, "/")
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE, main = "Burn 2")

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(centeredfd10[i])))
scl <- apply(abs(varmat), 2, max)
varmat <- sweep(varmat, 2, scl, "/")
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE, main = "Burn 10")

hour <- seq(0, 24, length = 100)
varmat <- eval.bifd(hour, hour, var.fd(tempfd, tempfd10))
image(hour, hour, varmat, col = heat.colors(100), xlab = "time of day at burn 2", ylab = "time of day at burn 10")
contour(hour, hour, varmat, add = TRUE)
abline(v = seq(0, 24, by = 3), h = seq(0, 24, by = 3), lty = 2, col = "#00000033")
abline(a = 0, b = 1, lty = 1, col = "#00000022")

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(tempfd[i], tempfd10[i])))
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = month.abb)
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE)

with each month rescaled to see detail:

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(tempfd[i], tempfd10[i])))
scl <- apply(abs(varmat), 2, max)
varmat <- sweep(varmat, 2, scl, "/")
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE)

Now for the same after removing daily mean variation

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(centeredfd[i], centeredfd10[i])))
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = month.abb)
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE)

The next plot is scaled covariance to see the detail by month since the summer months swamp the covariance between the sites

hour <- seq(0, 24, length = 100)
ydays <- lapply(month.abb, function(x) unique(subset(dat10, month == x) $ yday))
varmat <- sapply(ydays, function(i) eval.bifd(hour, hour, var.fd(centeredfd[i], centeredfd10[i])))
scl <- apply(abs(varmat), 2, max)
varmat <- sweep(varmat, 2, scl, "/")
vardat <- expand.grid(hour2 = hour, hour10 = hour, month = paste(month.abb, round(scl, 2)))
vardat $ covar <- c(varmat)
levelplot(covar ~ hour2 * hour10 | month, data = vardat, col.regions = heat.colors(100), contour = TRUE, as.table = TRUE)

So from the crosscorrelation plots (this will be clearer when we put in feint diagonal lines) that the peak of covariance is slightly off diagonal indicating that the daily curves have maximum variance at slightly different time of day. THis is most common in the afternoon peak, than in the morning peak (associated with the morning low) which tends to be more consistent between sites accross days.