knitr::opts_chunk$set(echo = TRUE)
This is an R Markdown Notebook to illustrate how to develop pixel-scale spectra-trait PLSR models. This example uses image data from NEON AOP and associated field measurements of leaf nitrogen content collected across a range of CONUS NEON sites. For more information refer to the dataset EcoSIS page: https://ecosis.org/package/canopy-spectra-to-map-foliar-functional-traits-over-neon-domains-in-eastern-united-states
list.of.packages <- c("pls","dplyr","here","plotrix","ggplot2","gridExtra","spectratrait") invisible(lapply(list.of.packages, library, character.only = TRUE))
### Setup options # Script options pls::pls.options(plsralg = "oscorespls") pls::pls.options("plsralg") # Default par options opar <- par(no.readonly = T) # What is the target variable? What is the variable name in the input dataset? inVar <- "Nitrogen" # What is the source dataset from EcoSIS? ecosis_id <- "b9dbf3db-5b9c-4ab2-88c2-26c8b39d0903" # Specify output directory, output_dir # Options: # tempdir - use a OS-specified temporary directory # user defined PATH - e.g. "~/scratch/PLSR" output_dir <- "tempdir"
if (output_dir=="tempdir") { outdir <- tempdir() } else { if (! file.exists(output_dir)) dir.create(output_dir,recursive=TRUE) outdir <- file.path(path.expand(output_dir)) } setwd(outdir) # set working directory print(getwd()) # check wd
print(paste0("Output directory: ",getwd())) # check wd dat_raw <- spectratrait::get_ecosis_data(ecosis_id = ecosis_id) head(dat_raw) names(dat_raw)[1:40]
# identify the trait data and other metadata sample_info <- dat_raw[,names(dat_raw) %notin% seq(300,2600,1)] head(sample_info) # spectra matrix Spectra <- as.matrix(dat_raw[,names(dat_raw) %notin% names(sample_info)]) # set the desired spectra wavelength range to include Start.wave <- 500 End.wave <- 2400 wv <- seq(Start.wave,End.wave,1) final_spec <- Spectra[,round(as.numeric(colnames(Spectra))) %in% wv] colnames(final_spec) <- c(paste0("Wave_",colnames(final_spec))) ## Drop bad spectra data - for canopy-scale reflectance, often the "water band" wavelengths ## are too noisy to use for trait estimation. Its possible to remove these wavelengths ## prior to model fitting. Its best to first identify which wavelengths to drop ## before attempting PLSR, as these ranges may need to be considered on a case-by-case ## basis or generalized for multiple datasets dropwaves <- c(1350:1440, 1826:1946) final_spec <- final_spec[,colnames(final_spec) %notin% paste0("Wave_",dropwaves)] wv <- as.numeric(gsub(pattern = "Wave_",replacement = "", x = colnames(final_spec))) ## Drop bad spectra data - for canopy-scale reflectance, often the "water band" wavelengths ## are too noisy to use for trait estimation. Its possible to remove these wavelengths ## prior to model fitting. Its best to first identify which wavelengths to drop ## before attempting PLSR, as these ranges may need to be considered on a case-by-case ## basis or generalized for multiple datasets dropwaves <- c(1350:1440, 1826:1946) final_spec <- final_spec[,colnames(final_spec) %notin% paste0("Wave_",dropwaves)] wv <- as.numeric(gsub(pattern = "Wave_",replacement = "", x = colnames(final_spec))) # assemble example dataset sample_info2 <- sample_info %>% select(Plot_ID,Sample_Year,SLA,Nitrogen) site_plot <- data.frame(matrix(unlist(strsplit(sample_info2$Plot_ID, "_")), ncol=2, byrow=TRUE)) colnames(site_plot) <- c("Plot_Num","SampleID") sample_info3 <- data.frame(site_plot,sample_info2) plsr_data <- data.frame(sample_info3,final_spec*0.01) rm(sample_info,sample_info2,sample_info3,Spectra, site_plot)
# Example data cleaning. End user needs to do what's appropriate for their # data. This may be an iterative process. # Keep only complete rows of inVar and spec data before fitting # plsr_data <- plsr_data %>% # remove erroneously high values, or "bad spectra" filter(Nitrogen<50) %>% filter(Wave_859<80) %>% filter(Wave_859>15) plsr_data <- plsr_data[complete.cases(plsr_data[,names(plsr_data) %in% c(inVar,paste0("Wave_",wv))]),]
## Make a stratified random sampling in the strata USDA_Species_Code and Domain method <- "base" #base/dplyr # base R - a bit slow # dplyr - much faster split_data <- spectratrait::create_data_split(dataset=plsr_data, approach=method, split_seed=2356326, prop=0.8, group_variables="Plot_Num") names(split_data) cal.plsr.data <- split_data$cal_data head(cal.plsr.data)[1:8] val.plsr.data <- split_data$val_data head(val.plsr.data)[1:8] rm(split_data) # Datasets: print(paste("Cal observations: ",dim(cal.plsr.data)[1],sep="")) print(paste("Val observations: ",dim(val.plsr.data)[1],sep="")) cal_hist_plot <- ggplot(data = cal.plsr.data, aes(x = cal.plsr.data[,paste0(inVar)])) + geom_histogram(fill=I("grey50"),col=I("black"),alpha=I(.7)) + labs(title=paste0("Calibration Histogram for ",inVar), x = paste0(inVar), y = "Count") val_hist_plot <- ggplot(data = val.plsr.data, aes(x = val.plsr.data[,paste0(inVar)])) + geom_histogram(fill=I("grey50"),col=I("black"),alpha=I(.7)) + labs(title=paste0("Validation Histogram for ",inVar), x = paste0(inVar), y = "Count") histograms <- grid.arrange(cal_hist_plot, val_hist_plot, ncol=2) ggsave(filename = file.path(outdir,paste0(inVar,"_Cal_Val_Histograms.png")), plot = histograms, device="png", width = 30, height = 12, units = "cm", dpi = 300) # output cal/val data write.csv(cal.plsr.data,file=file.path(outdir,paste0(inVar,'_Cal_PLSR_Dataset.csv')), row.names=FALSE) write.csv(val.plsr.data,file=file.path(outdir,paste0(inVar,'_Val_PLSR_Dataset.csv')), row.names=FALSE)
cal_spec <- as.matrix(cal.plsr.data[, which(names(cal.plsr.data) %in% paste0("Wave_",wv))]) cal.plsr.data <- data.frame(cal.plsr.data[, which(names(cal.plsr.data) %notin% paste0("Wave_",wv))], Spectra=I(cal_spec)) head(cal.plsr.data)[1:5] val_spec <- as.matrix(val.plsr.data[, which(names(val.plsr.data) %in% paste0("Wave_",wv))]) val.plsr.data <- data.frame(val.plsr.data[, which(names(val.plsr.data) %notin% paste0("Wave_",wv))], Spectra=I(val_spec)) head(val.plsr.data)[1:5]
par(mfrow=c(1,2)) # B, L, T, R spectratrait::f.plot.spec(Z=cal.plsr.data$Spectra,wv=wv,plot_label="Calibration") spectratrait::f.plot.spec(Z=val.plsr.data$Spectra,wv=wv,plot_label="Validation") dev.copy(png,file.path(outdir,paste0(inVar,'_Cal_Val_Spectra.png')), height=2500,width=4900, res=340) dev.off(); par(mfrow=c(1,1))
if(grepl("Windows", sessionInfo()$running)){ pls.options(parallel = NULL) } else { pls.options(parallel = parallel::detectCores()-1) } method <- "pls" #pls, firstPlateau, firstMin random_seed <- 1245565 seg <- 50 maxComps <- 16 iterations <- 80 prop <- 0.70 if (method=="pls") { # pls package approach - faster but estimates more components.... nComps <- spectratrait::find_optimal_components(dataset=cal.plsr.data, targetVariable=inVar, method=method, maxComps=maxComps, seg=seg, random_seed=random_seed) print(paste0("*** Optimal number of components: ", nComps)) } else { nComps <- spectratrait::find_optimal_components(dataset=cal.plsr.data, targetVariable=inVar, method=method, maxComps=maxComps, iterations=iterations, seg=seg, prop=prop, random_seed=random_seed) } dev.copy(png,file.path(outdir,paste0(paste0(inVar,"_PLSR_Component_Selection.png"))), height=2800, width=3400, res=340) dev.off();
plsr.out <- plsr(as.formula(paste(inVar,"~","Spectra")),scale=FALSE,ncomp=nComps,validation="LOO", trace=FALSE,data=cal.plsr.data) fit <- plsr.out$fitted.values[,1,nComps] pls.options(parallel = NULL) # External validation fit stats par(mfrow=c(1,2)) # B, L, T, R pls::RMSEP(plsr.out, newdata = val.plsr.data) plot(pls::RMSEP(plsr.out,estimate=c("test"),newdata = val.plsr.data), main="MODEL RMSEP", xlab="Number of Components",ylab="Model Validation RMSEP",lty=1,col="black",cex=1.5,lwd=2) box(lwd=2.2) R2(plsr.out, newdata = val.plsr.data) plot(pls::R2(plsr.out,estimate=c("test"),newdata = val.plsr.data), main="MODEL R2", xlab="Number of Components",ylab="Model Validation R2",lty=1,col="black",cex=1.5,lwd=2) box(lwd=2.2) dev.copy(png,file.path(outdir,paste0(paste0(inVar,"_Validation_RMSEP_R2_by_Component.png"))), height=2800, width=4800, res=340) dev.off(); par(opar)
#calibration cal.plsr.output <- data.frame(cal.plsr.data[, which(names(cal.plsr.data) %notin% "Spectra")], PLSR_Predicted=fit, PLSR_CV_Predicted=as.vector(plsr.out$validation$pred[,,nComps])) cal.plsr.output <- cal.plsr.output %>% mutate(PLSR_CV_Residuals = PLSR_CV_Predicted-get(inVar)) head(cal.plsr.output) cal.R2 <- round(pls::R2(plsr.out,intercept=F)[[1]][nComps],2) cal.RMSEP <- round(sqrt(mean(cal.plsr.output$PLSR_CV_Residuals^2)),2) val.plsr.output <- data.frame(val.plsr.data[, which(names(val.plsr.data) %notin% "Spectra")], PLSR_Predicted=as.vector(predict(plsr.out, newdata = val.plsr.data, ncomp=nComps, type="response")[,,1])) val.plsr.output <- val.plsr.output %>% mutate(PLSR_Residuals = PLSR_Predicted-get(inVar)) head(val.plsr.output) val.R2 <- round(pls::R2(plsr.out,newdata=val.plsr.data,intercept=F)[[1]][nComps],2) val.RMSEP <- round(sqrt(mean(val.plsr.output$PLSR_Residuals^2)),2) rng_quant <- quantile(cal.plsr.output[,inVar], probs = c(0.001, 0.999)) cal_scatter_plot <- ggplot(cal.plsr.output, aes(x=PLSR_CV_Predicted, y=get(inVar))) + theme_bw() + geom_point() + geom_abline(intercept = 0, slope = 1, color="dark grey", linetype="dashed", linewidth=1.5) + xlim(rng_quant[1], rng_quant[2]) + ylim(rng_quant[1], rng_quant[2]) + labs(x=paste0("Predicted ", paste(inVar), " (units)"), y=paste0("Observed ", paste(inVar), " (units)"), title=paste0("Calibration: ", paste0("Rsq = ", cal.R2), "; ", paste0("RMSEP = ", cal.RMSEP))) + theme(axis.text=element_text(size=18), legend.position="none", axis.title=element_text(size=20, face="bold"), axis.text.x = element_text(angle = 0,vjust = 0.5), panel.border = element_rect(linetype = "solid", fill = NA, linewidth=1.5)) cal_resid_histogram <- ggplot(cal.plsr.output, aes(x=PLSR_CV_Residuals)) + geom_histogram(alpha=.5, position="identity") + geom_vline(xintercept = 0, color="black", linetype="dashed", linewidth=1) + theme_bw() + theme(axis.text=element_text(size=18), legend.position="none", axis.title=element_text(size=20, face="bold"), axis.text.x = element_text(angle = 0,vjust = 0.5), panel.border = element_rect(linetype = "solid", fill = NA, linewidth=1.5)) rng_quant <- quantile(val.plsr.output[,inVar], probs = c(0.001, 0.999)) val_scatter_plot <- ggplot(val.plsr.output, aes(x=PLSR_Predicted, y=get(inVar))) + theme_bw() + geom_point() + geom_abline(intercept = 0, slope = 1, color="dark grey", linetype="dashed", linewidth=1.5) + xlim(rng_quant[1], rng_quant[2]) + ylim(rng_quant[1], rng_quant[2]) + labs(x=paste0("Predicted ", paste(inVar), " (units)"), y=paste0("Observed ", paste(inVar), " (units)"), title=paste0("Validation: ", paste0("Rsq = ", val.R2), "; ", paste0("RMSEP = ", val.RMSEP))) + theme(axis.text=element_text(size=18), legend.position="none", axis.title=element_text(size=20, face="bold"), axis.text.x = element_text(angle = 0,vjust = 0.5), panel.border = element_rect(linetype = "solid", fill = NA, linewidth=1.5)) val_resid_histogram <- ggplot(val.plsr.output, aes(x=PLSR_Residuals)) + geom_histogram(alpha=.5, position="identity") + geom_vline(xintercept = 0, color="black", linetype="dashed", linewidth=1) + theme_bw() + theme(axis.text=element_text(size=18), legend.position="none", axis.title=element_text(size=20, face="bold"), axis.text.x = element_text(angle = 0,vjust = 0.5), panel.border = element_rect(linetype = "solid", fill = NA, linewidth=1.5)) # plot cal/val side-by-side scatterplots <- grid.arrange(cal_scatter_plot, val_scatter_plot, cal_resid_histogram, val_resid_histogram, nrow=2,ncol=2) ggsave(filename = file.path(outdir,paste0(inVar,"_Cal_Val_Scatterplots.png")), plot = scatterplots, device="png", width = 32, height = 30, units = "cm", dpi = 300)
vips <- spectratrait::VIP(plsr.out)[nComps,] par(mfrow=c(2,1)) plot(plsr.out$coefficients[,,nComps], x=wv,xlab="Wavelength (nm)", ylab="Regression coefficients",lwd=2,type='l') box(lwd=2.2) plot(wv, vips, xlab="Wavelength (nm)",ylab="VIP",cex=0.01) lines(wv, vips, lwd=3) abline(h=0.8, lty=2, col="dark grey") box(lwd=2.2) dev.copy(png, file.path(outdir, paste0(inVar,'_Coefficient_VIP_plot.png')), height=3100, width=4100, res=340) dev.off(); par(opar)
if(grepl("Windows", sessionInfo()$running)){ pls.options(parallel =NULL) } else { pls.options(parallel = parallel::detectCores()-1) } ### PLSR bootstrap permutation uncertainty analysis iterations <- 500 # how many permutation iterations to run prop <- 0.70 # fraction of training data to keep for each iteration plsr_permutation <- spectratrait::pls_permutation(dataset=cal.plsr.data, targetVariable=inVar, maxComps=nComps, iterations=iterations, prop=prop, verbose = FALSE) bootstrap_intercept <- plsr_permutation$coef_array[1,,nComps] bootstrap_coef <- plsr_permutation$coef_array[2:length(plsr_permutation$coef_array[,1,nComps]), ,nComps] rm(plsr_permutation) # apply coefficients to left-out validation data interval <- c(0.025,0.975) Bootstrap_Pred <- val.plsr.data$Spectra %*% bootstrap_coef + matrix(rep(bootstrap_intercept, length(val.plsr.data[,inVar])), byrow=TRUE, ncol=length(bootstrap_intercept)) Interval_Conf <- apply(X = Bootstrap_Pred, MARGIN = 1, FUN = quantile, probs=c(interval[1], interval[2])) sd_mean <- apply(X = Bootstrap_Pred, MARGIN = 1, FUN = sd) sd_res <- sd(val.plsr.output$PLSR_Residuals) sd_tot <- sqrt(sd_mean^2+sd_res^2) val.plsr.output$LCI <- Interval_Conf[1,] val.plsr.output$UCI <- Interval_Conf[2,] val.plsr.output$LPI <- val.plsr.output$PLSR_Predicted-1.96*sd_tot val.plsr.output$UPI <- val.plsr.output$PLSR_Predicted+1.96*sd_tot head(val.plsr.output)
spectratrait::f.plot.coef(Z = t(bootstrap_coef), wv = wv, plot_label="Bootstrap regression coefficients", position = 'bottomleft') abline(h=0,lty=2,col="grey50") box(lwd=2.2) dev.copy(png,file.path(outdir,paste0(inVar,'_Bootstrap_Regression_Coefficients.png')), height=2100, width=3800, res=340) dev.off();
rmsep_percrmsep <- spectratrait::percent_rmse(plsr_dataset = val.plsr.output, inVar = inVar, residuals = val.plsr.output$PLSR_Residuals, range="full") RMSEP <- rmsep_percrmsep$rmse perc_RMSEP <- rmsep_percrmsep$perc_rmse r2 <- round(pls::R2(plsr.out, newdata = val.plsr.data, intercept=F)$val[nComps],2) expr <- vector("expression", 3) expr[[1]] <- bquote(R^2==.(r2)) expr[[2]] <- bquote(RMSEP==.(round(RMSEP,2))) expr[[3]] <- bquote("%RMSEP"==.(round(perc_RMSEP,2))) rng_vals <- c(min(val.plsr.output$LPI), max(val.plsr.output$UPI)) par(mfrow=c(1,1), mar=c(4.2,5.3,1,0.4), oma=c(0, 0.1, 0, 0.2)) plotrix::plotCI(val.plsr.output$PLSR_Predicted,val.plsr.output[,inVar], li=val.plsr.output$LPI, ui=val.plsr.output$UPI, gap=0.009,sfrac=0.000, lwd=1.6, xlim=c(rng_vals[1], rng_vals[2]), ylim=c(rng_vals[1], rng_vals[2]), err="x", pch=21, col="black", pt.bg=scales::alpha("grey70",0.7), scol="grey80", cex=2, xlab=paste0("Predicted ", paste(inVar), " (units)"), ylab=paste0("Observed ", paste(inVar), " (units)"), cex.axis=1.5,cex.lab=1.8) abline(0,1,lty=2,lw=2) plotrix::plotCI(val.plsr.output$PLSR_Predicted,val.plsr.output[,inVar], li=val.plsr.output$LCI, ui=val.plsr.output$UCI, gap=0.009,sfrac=0.004, lwd=1.6, xlim=c(rng_vals[1], rng_vals[2]), ylim=c(rng_vals[1], rng_vals[2]), err="x", pch=21, col="black", pt.bg=scales::alpha("grey70",0.7), scol="black", cex=2, xlab=paste0("Predicted ", paste(inVar), " (units)"), ylab=paste0("Observed ", paste(inVar), " (units)"), cex.axis=1.5,cex.lab=1.8, add=T) legend("topleft", legend=expr, bty="n", cex=1.5) legend("bottomright", legend=c("Prediction Interval","Confidence Interval"), lty=c(1,1), col = c("grey80","black"), lwd=3, bty="n", cex=1.5) box(lwd=2.2) dev.copy(png,file.path(outdir,paste0(inVar,"_PLSR_Validation_Scatterplot.png")), height=2800, width=3200, res=340) dev.off();
out.jk.coefs <- data.frame(Iteration=seq(1,length(bootstrap_intercept),1), Intercept=bootstrap_intercept,t(bootstrap_coef)) names(out.jk.coefs) <- c("Iteration","Intercept",paste0("Wave_",wv)) head(out.jk.coefs)[1:6] write.csv(out.jk.coefs,file=file.path(outdir,paste0(inVar,'_Bootstrap_PLSR_Coefficients.csv')), row.names=FALSE)
print(paste("Output directory: ", getwd())) # Observed versus predicted write.csv(cal.plsr.output,file=file.path(outdir, paste0(inVar,'_Observed_PLSR_CV_Pred_',nComps, 'comp.csv')),row.names=FALSE) # Validation data write.csv(val.plsr.output,file=file.path(outdir, paste0(inVar,'_Validation_PLSR_Pred_',nComps, 'comp.csv')),row.names=FALSE) # Model coefficients coefs <- coef(plsr.out,ncomp=nComps,intercept=TRUE) write.csv(coefs,file=file.path(outdir,paste0(inVar,'_PLSR_Coefficients_', nComps,'comp.csv')), row.names=TRUE) # PLSR VIP write.csv(vips,file=file.path(outdir,paste0(inVar, '_PLSR_VIPs_',nComps, 'comp.csv')))
print("**** PLSR output files: ") print(list.files(outdir)[grep(pattern = inVar, list.files(outdir))])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.