#' Characterize land surface phenology using spectral vegetation index time series
#'
#' @description
#' This function characterizes seasonal land surface phenology at each sample site using
#' flexible cubic splines that are iteratively fit to time series of spectral vegetation indices
#' (e.g., NDVI). This function facilitates estimating annual maximum NDVI and other
#' spectral vegetation indices with lsat_summarize_growing_seasons(). For each site,
#' cubic splines are iteratively fit to measurements pooled over years within a moving
#' window that has a user-specified width. Each cubic spline is iteratively fit, with
#' each iteration checking if there are outliers and, if so, excluding outliers and refitting.
#' The function returns information about typical phenology at a sample site and about
#' the relative phenological timing of each individual measuremenent. This function was
#' designed for situations where the seasonal phenology is hump-shaped. If you are using
#' a spectral index that is typically negative (e.g., Normalized Difference Water Index)
#' then multiply the index by -1 before running this function, then back-transform
#' your index after running the lsat_summarize_growing_seasons() function.
#'
#' @param dt Data.table with a multi-year time series a vegetation index.
#' @param si Character string specifying the spectral index (e.g., NDVI) to use for
#' determining surface phenology. This must correspond to an existing column
#' in the data.table.
#' @param window.yrs Number specifying the focal window width in years that is used when
#' pooling data to fit cubic splines (use odd numbers).
#' @param window.min.obs Minimum number of focal window observations necessary to fit
#' a cubic spline.
#' @param si.min Minimum value of spectral index necessary for observation to be used
#' when fitting cubic splines. Defaults to 0.15 which for NDVI is about when plants
#' are present. Note that si.min must be >= 0 because the underlying spline fitting
#' function will error out if provided negative values.
#' @param spar Smoothing parameter typically around 0.70 - 0.80 for this application.
#' A higher value means a less flexible spline. Defaults to 0.78.
#' @param pcnt.dif.thresh Vector with two numbers that specify the allowable
#' negative and positive percent difference between individual
#' observations and fitted cubic spline. Observations that differ by more than
#' these thresholds are filtered out and the cubic spline is iteratively refit.
#' Defaults to -30% and 30%.
#' @param weight When fitting the cubic splines, should individual observations be
#' weighted by their year of acquisition relative to the focal year?
#' If so, each observation is weighted by exp(-0.25*n.yrs.from.focal) when fitting the cubic splines.
#' @param spl.fit.outfile (Optional) Name of output csv file containing the fitted
#' cubic splines for each sample site. Useful for subsequent visualization.
#' @param progress (TRUE/FALSE) Print a progress report?
#' @param test.run (TRUE/FALSE) If TRUE, then algorithm is run using a small random
#' subset of data and only a figure is output. This is used for model parameterization.
#' @param plot.title (Optional) Custom title for output figure.
#' @return Data.table that provides, for each observation, information on the phenological
#' conditions for that specific day of year during the focal period.
#' These data can then be used to estimate annual maximum spectral index
#' and other growing season metrics using lsat_summarize_growing_season().
#' A figure is also generated that shows observation points and phenological
#' curves for nine random sample locations.
#' @import data.table
#' @export lsat_fit_phenological_curves
#'
#' @examples
#' data(lsat.example.dt)
#' lsat.dt <- lsat_format_data(lsat.example.dt)
#' lsat.dt <- lsat_clean_data(lsat.dt)
#' lsat.dt <- lsat_calc_spectral_index(lsat.dt, 'ndvi')
#' lsat.dt <- lsat_calibrate_rf(lsat.dt, band.or.si = 'ndvi',
#' write.output = FALSE, train.with.highlat.data = TRUE)
#' lsat.pheno.dt <- lsat_fit_phenological_curves(lsat.dt, si = 'ndvi')
#' lsat.pheno.dt
lsat_fit_phenological_curves = function(dt,
si,
window.yrs=7,
window.min.obs=20,
si.min=0.15,
spar=0.78,
pcnt.dif.thresh=c(-30,30),
weight=TRUE,
spl.fit.outfile=FALSE,
progress=TRUE,
test.run=FALSE,
plot.title=NA){
dt <- data.table::data.table(dt)
dt[, obs.id := 1:nrow(dt)]
obs.filtered <- c()
# (OPTIONAL) SUBSAMPLE SITES IF RUNNING IN TEST MODE
n.sites <- length(unique(dt$sample.id))
if (test.run == TRUE){
if (n.sites < 9){
stop(paste0('Your data set only has ', n.sites, ' sample sites, so do not run in test mode'))
} else {
dt <- dt[sample.id %in% sample(unique(dt$sample.id), 9)]
}
}
# GET SAMPLE ID, DOY, YEAR, AND SPECTRAL INDEX FROM INPUT DATA TABLE
dt <- dt[, eval(c('sample.id','obs.id','latitude','longitude','year','doy',si)), with=F]
dt <- data.table::setnames(dt, si, 'si')
dt <- dt[order(sample.id,doy)]
# FILTER OUT LOW VALUES
dt <- dt[si >= si.min]
# INITIAL OUTLIER EXCLUSION BY FITTING SPLINE ACROSS ALL YEARS
dt[, n.obs := .N, by = 'sample.id']
dt <- dt[n.obs >= window.min.obs]
dt <- dt[, n.obs := NULL]
rough.splines.dt <- dt[, .(spl.fit = list(stats::smooth.spline(doy, si, spar = spar))), by = 'sample.id']
doy.rng <- min(dt$doy):max(dt$doy)
rough.spline.fits.dt <- rough.splines.dt[, .(spl.fit = unlist(Map(function(mod,doy){stats::predict(mod, data.frame(doy=doy.rng))$y}, spl.fit))), by = 'sample.id']
rough.spline.fits.dt <- rough.spline.fits.dt[, doy := doy.rng, by = 'sample.id']
dt <- rough.spline.fits.dt[dt, on = c('sample.id','doy')]
dt <- dt[, pcnt.dif := (si - spl.fit)/((si+spl.fit)/2)*100]
dt <- dt[pcnt.dif > -100 & pcnt.dif < 100]
dt <- dt[, c('spl.fit', 'pcnt.dif'):= NULL]
rm(rough.spline.fits.dt)
rm(rough.splines.dt)
# IDENTIFY TIME PERIODS
all.yrs <- sort(unique(dt$year))
rng.yrs <- min(all.yrs):max(all.yrs)
focal.yrs <- rng.yrs[-c(1:(round(window.yrs/2)), (length(rng.yrs)-round(window.yrs/2)+1):length(rng.yrs))]
n.focal.yrs <- length(focal.yrs)
# CREATE LISTS FOR TEMPORARY STORAGE
data.list <- list()
splines.list <- list()
# LOOP THROUGH FOCAL YEARS, FITTING SPLINE FOR EACH SAMPLE SITE
all.yrs <- min(dt$year):max(dt$year)
for (i in all.yrs){
focal.yr <- i
# DETERMINE FOCAL WINDOW
half.win.yrs <- round(window.yrs/2)
if (i - half.win.yrs < min(all.yrs)){ # start of record
focal.win <- min(all.yrs):(min(all.yrs)+window.yrs-1)
} else if (i + half.win.yrs > max(all.yrs)){ # end of record
focal.win <- (max(all.yrs)-window.yrs+1):max(all.yrs)
} else{ # middle of record
focal.win <- (i - half.win.yrs):(i + half.win.yrs)
}
# SUBSET OBS FROM FOCAL PERIOD
focal.dt <- dt[year %in% focal.win]
focal.dt <- focal.dt[order(sample.id,doy)]
# COMPUTE NUMBER OF OBS DURING FOCAL PERIOD FOR EACH SAMPLE SITE AND EXCLUDE SITES WITH FEWER THAN SOME USE-SPECIFIC THRESHOLD
focal.dt <- focal.dt[, n.obs.focal.win := .N, by = 'sample.id']
focal.dt <- focal.dt[n.obs.focal.win >= window.min.obs]
# IF NO SITES MEET THE OBSERVATION CRITERIA, THEN SKIP TO THE NEXT YEAR
if (nrow(focal.dt) == 0){
if (progress == T){print(paste0('skipping ', focal.yr,' because too little data'))}
next()
}
# FIT SPLINE TO SEASONAL TIME SERIES AT EACH SAMPLE SITE
if (weight == T){
focal.dt[, n.yrs.from.focal := abs(year - focal.yr)]
focal.dt[, weight := exp(-0.25*n.yrs.from.focal)]
splines.dt <- focal.dt[, .(spl.fit = list(stats::smooth.spline(doy, si, w = weight, spar = spar))), by = 'sample.id']
} else {
splines.dt <- focal.dt[, .(spl.fit = list(stats::smooth.spline(doy, si, spar = spar))), by = 'sample.id']
}
doy.rng <- min(focal.dt$doy):max(focal.dt$doy)
spline.fits.dt <- splines.dt[, .(spl.fit = unlist(Map(function(mod,doy){stats::predict(mod, data.frame(doy=doy.rng))$y}, spl.fit))), by = 'sample.id']
spline.fits.dt <- spline.fits.dt[, doy := doy.rng, by = 'sample.id']
# ITERATIVE QUALITY CHECK: LOOK FOR LARGE DIFFS BETWEEN OBS AND FITTED VALUES, DROP OBS WITH TOO LARGE A DIFF, REFIT SPLINES, RECHECK
refitting = 1
# ii=1
while(refitting == 1){
focal.dt <- spline.fits.dt[focal.dt, on = c('sample.id','doy')]
focal.dt <- focal.dt[, pcnt.dif := (si - spl.fit)/((si+spl.fit)/2)*100] # calc abs % dif
refit.sites <- unique(focal.dt[pcnt.dif < pcnt.dif.thresh[1] | pcnt.dif > pcnt.dif.thresh[2]]$sample.id)
obs.filtered <- unique(c(obs.filtered, unique(focal.dt[pcnt.dif < pcnt.dif.thresh[1] | pcnt.dif > pcnt.dif.thresh[2]]$obs.id)))
focal.dt <- focal.dt[pcnt.dif > pcnt.dif.thresh[1] & pcnt.dif < pcnt.dif.thresh[2]]
focal.dt <- focal.dt[, c('spl.fit', 'pcnt.dif'):= NULL]
refit.dt <- focal.dt[sample.id %in% refit.sites]
# REFIT SPLINES AT SITES THAT HAD LARGE DIFFS BETWEEN OBS AND FITTED VALUES
if (length(refit.sites) > 0){
# set aside the splines that don't need to be refit
spline.fits.dt <- spline.fits.dt[sample.id %in% refit.sites == F]
# check the number of obs per site and filter sites out those with too few obs
refit.dt <- refit.dt[, n.obs.focal.win := .N, by = 'sample.id']
refit.dt <- refit.dt[n.obs.focal.win >= window.min.obs]
# if no data are left, then stop refitting...
if (nrow(refit.dt) == 0){
refitting = 0
} else{
# refit
if (weight == T){
spline.refits.dt <- refit.dt[, .(spl.fit = list(stats::smooth.spline(doy, si, w = weight, spar = spar))), by = 'sample.id']
} else {
spline.refits.dt <- refit.dt[, .(spl.fit = list(stats::smooth.spline(doy, si, spar = spar))), by = 'sample.id']
}
spline.refits.dt <- spline.refits.dt[, .(spl.fit = unlist(Map(function(mod,doy){stats::predict(mod, data.frame(doy=doy.rng))$y}, spl.fit))), by = 'sample.id']
spline.refits.dt <- spline.refits.dt[, doy := doy.rng, by = 'sample.id']
spline.fits.dt <- rbind(spline.fits.dt, spline.refits.dt)
}
} else {
refitting = 0
}
} # end of refitting
# DROP WEIGHTING COLUMNS
if (weight == T){
focal.dt <- focal.dt[, c('n.yrs.from.focal','weight') := NULL]
}
# CALCULATE SEVERAL PHENOLOGY METRICS FOR EACH SAMPLE SITE
sample.doy.smry <- focal.dt[, .(min.doy = min(doy), max.doy = max(doy)), by = 'sample.id'] # identify DOY range for each site
spline.fits.dt <- spline.fits.dt[sample.doy.smry, on = 'sample.id']
spline.fits.dt <- spline.fits.dt[doy >= min.doy][doy <= max.doy] # limit spline fit to DOY range at each site
spline.fits.dt <- spline.fits.dt[, spl.fit.max := max(spl.fit), by = 'sample.id'] # compute max si typically observed at a site
spline.fits.dt <- spline.fits.dt[, spl.fit.max.doy := doy[which.max(spl.fit)], by = 'sample.id'] # calculate typical day of peak greenness
spline.fits.dt <- spline.fits.dt[, spl.frac.max := spl.fit / spl.fit.max]
spline.fits.dt <- spline.fits.dt[, si.adjustment := spl.fit.max - spl.fit] # compute adjustment factor
spline.fits.dt <- spline.fits.dt[, focal.yr := focal.yr]
spline.fits.dt <- spline.fits.dt[, c('min.doy','max.doy'):= NULL]
splines.list[[i]] <- spline.fits.dt
# ADD PHENOLOGY METRICS TO FOCAL DATA
focal.dt <- spline.fits.dt[focal.dt, on = c('sample.id','doy')]
focal.dt <- focal.dt[, si.max.pred := si + si.adjustment]
focal.dt <- focal.dt[, c('focal.yr') := NULL]
focal.dt <- focal.dt[year == focal.yr]
setnames(focal.dt, c('n.obs.focal.win'),c('spl.n.obs'))
# ADD PHENOLOGY DATA TO MAIN DATA TABLE
data.list[[i]] <- focal.dt
# PRINT STATUS (if requested)
if (progress == TRUE){print(paste0('finished ', focal.yr))}
} # end focal year loop
# WRITE OUT FILE WITH SPLINE FITS (OPTIONAL)
spline.dt <- data.table::data.table(data.table::rbindlist(splines.list))
if (spl.fit.outfile != FALSE){
data.table::fwrite(spline.dt, spl.fit.outfile)
}
# CREATE OUTPUT DATA TABLE BY COMBINING FOCAL YEAR DATA
dt <- data.table::data.table(rbindlist(data.list))
dt <- dt[order(sample.id,year,doy)]
data.table::setcolorder(dt, c('sample.id','latitude','longitude','year','doy',
'si','spl.n.obs','spl.fit','spl.frac.max',
'spl.fit.max','spl.fit.max.doy','si.adjustment',
'si.max.pred'))
# OUTPUT FIGURE
n.sites <- length(unique(dt$sample.id))
if (n.sites > 9){
example.ids <- sample(unique(dt$sample.id), 9, replace = F)
} else {
example.ids <- sample(unique(dt$sample.id), n.sites, replace = F)
}
example.obs.dt <- dt[sample.id %in% example.ids]
example.curves.dt <- spline.dt[sample.id %in% example.ids]
if(is.na(plot.title) & n.sites > 9){
plot.title <- 'Phenological curve fits for nine random sample locations'
} else if (is.na(plot.title) & n.sites <= 9) {
plot.title = 'Phenological curve fits'
}
fig <- ggplot2::ggplot(example.obs.dt, ggplot2::aes(doy, si)) +
ggplot2::labs(y=paste0('Landsat ', toupper(si)), x='Day of Year') +
ggplot2::ggtitle(plot.title) +
ggplot2::facet_wrap(~sample.id, nrow = 3, ncol = 3, scales = 'free_y') +
ggplot2::geom_point(ggplot2::aes(fill = year), pch=21, color = 'black', size = 2) +
ggplot2::scale_fill_gradientn(name = 'Observation', colours = c('blue','red','gold')) +
ggplot2::geom_line(data = example.curves.dt, mapping = ggplot2::aes(doy, spl.fit, group = focal.yr, color = focal.yr), alpha = 0.75) +
ggplot2::scale_color_gradientn(name = 'Curve', colours = c('blue','red','gold')) +
ggplot2::theme_bw() + ggplot2::theme(legend.position = 'right',
legend.text=ggplot2::element_text(size=12), legend.title=ggplot2::element_text(size=12),
axis.text=ggplot2::element_text(size=12), axis.title=ggplot2::element_text(size=14),
plot.title=ggplot2::element_text(hjust = 0.5),
strip.background = ggplot2::element_rect(fill="black"),
strip.text = ggplot2::element_text(color = "white", size = 12)) +
ggplot2::guides(colour = ggplot2::guide_colourbar(title.position="top", title.hjust = 0.5))
print(fig)
# OUTPUT DATA TABLE
if (test.run == FALSE){
colnames(dt) <- gsub('si',si,colnames(dt))
dt
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.