knitr::opts_chunk$set(echo = TRUE)
This is an R Markdown Notebook to illustrate how to retrieve a dataset from the EcoSIS spectral database, choose the "optimal" number of plsr components, and fit a plsr model for specific leaf area (SLA). In this example, the plants were cultivated in an outdoor setting in the botanical garden of the KIT using 40x40 cm pots with an standardized substrate. The data was measured on a weekly basis (the timestamp is included in the dataset).
list.of.packages <- c("pls","dplyr","reshape2","here","plotrix","ggplot2","gridExtra", "spectratrait") invisible(lapply(list.of.packages, library, character.only = TRUE))
### Setup options # Script options pls::pls.options(plsralg = "oscorespls") pls::pls.options("plsralg") # Default par options opar <- par(no.readonly = T) # What is the target variable? inVar <- "SLA_g_cm" # What is the source dataset from EcoSIS? ecosis_id <- "3cf6b27e-d80e-4bc7-b214-c95506e46daa" # Specify output directory, output_dir # Options: # tempdir - use a OS-specified temporary directory # user defined PATH - e.g. "~/scratch/PLSR" output_dir <- "tempdir"
outdir <- tempdir() setwd(outdir) # set working directory print(paste0("Output directory: ",getwd()))
print(paste0("Output directory: ",getwd())) # check wd ### Get source dataset from EcoSIS dat_raw <- spectratrait::get_ecosis_data(ecosis_id = ecosis_id) head(dat_raw) names(dat_raw)[1:40]
### Create plsr dataset Start.wave <- 500 End.wave <- 2400 wv <- seq(Start.wave,End.wave,1) Spectra <- as.matrix(dat_raw[,names(dat_raw) %in% wv]) colnames(Spectra) <- c(paste0("Wave_",wv)) sample_info <- dat_raw[,names(dat_raw) %notin% seq(350,2500,1)] head(sample_info) sample_info2 <- sample_info %>% select(Plant_Species=species,Growth_Form=`growth form`,timestamp, SLA_g_cm=`SLA (g/cm )`) %>% mutate(SLA_g_cm=as.numeric(SLA_g_cm)) # ensure SLA is numeric head(sample_info2) plsr_data <- data.frame(sample_info2,Spectra) rm(sample_info,sample_info2,Spectra)
#### End user needs to do what's appropriate for their data. This may be an iterative process. # Keep only complete rows of inVar and spec data before fitting plsr_data <- plsr_data[complete.cases(plsr_data[,names(plsr_data) %in% c(inVar,wv)]),] # Remove suspect high values plsr_data <- plsr_data[ plsr_data[,inVar] <= 500, ]
### Create cal/val datasets ## Make a stratified random sampling in the strata USDA_Species_Code and Domain method <- "base" #base/dplyr # base R - a bit slow # dplyr - much faster split_data <- spectratrait::create_data_split(dataset=plsr_data, approach=method, split_seed=2356812, prop=0.8, group_variables="Plant_Species") names(split_data) cal.plsr.data <- split_data$cal_data val.plsr.data <- split_data$val_data rm(split_data) # Datasets: print(paste("Cal observations: ",dim(cal.plsr.data)[1],sep="")) print(paste("Val observations: ",dim(val.plsr.data)[1],sep="")) cal_hist_plot <- ggplot(data = cal.plsr.data, aes(x = cal.plsr.data[,paste0(inVar)])) + geom_histogram(fill=I("grey50"),col=I("black"),alpha=I(.7)) + labs(title=paste0("Calibration Histogram for ",inVar), x = paste0(inVar), y = "Count") val_hist_plot <- ggplot(data = val.plsr.data, aes(x = val.plsr.data[,paste0(inVar)])) + geom_histogram(fill=I("grey50"),col=I("black"),alpha=I(.7)) + labs(title=paste0("Validation Histogram for ",inVar), x = paste0(inVar), y = "Count") histograms <- grid.arrange(cal_hist_plot, val_hist_plot, ncol=2) ggsave(filename = file.path(outdir,paste0(inVar,"_Cal_Val_Histograms.png")), plot = histograms, device="png", width = 30, height = 12, units = "cm", dpi = 300) # output cal/val data write.csv(cal.plsr.data,file=file.path(outdir,paste0(inVar,'_Cal_PLSR_Dataset.csv')), row.names=FALSE) write.csv(val.plsr.data,file=file.path(outdir,paste0(inVar,'_Val_PLSR_Dataset.csv')), row.names=FALSE)
### Format PLSR data for model fitting cal_spec <- as.matrix(cal.plsr.data[, which(names(cal.plsr.data) %in% paste0("Wave_",wv))]) cal.plsr.data <- data.frame(cal.plsr.data[, which(names(cal.plsr.data) %notin% paste0("Wave_",wv))], Spectra=I(cal_spec)) val_spec <- as.matrix(val.plsr.data[, which(names(val.plsr.data) %in% paste0("Wave_",wv))]) val.plsr.data <- data.frame(val.plsr.data[, which(names(val.plsr.data) %notin% paste0("Wave_",wv))], Spectra=I(val_spec))
par(mfrow=c(1,2)) # B, L, T, R spectratrait::f.plot.spec(Z=cal.plsr.data$Spectra,wv=wv,plot_label="Calibration") spectratrait::f.plot.spec(Z=val.plsr.data$Spectra,wv=wv,plot_label="Validation") dev.copy(png,file.path(outdir,paste0(inVar,'_Cal_Val_Spectra.png')), height=2500,width=4900, res=340) dev.off(); par(mfrow=c(1,1))
### Use permutation to determine the optimal number of components if(grepl("Windows", sessionInfo()$running)){ pls.options(parallel = NULL) } else { pls.options(parallel = parallel::detectCores()-1) } method <- "pls" #pls, firstPlateau, firstMin random_seed <- 2356812 seg <- 100 maxComps <- 18 iterations <- 50 prop <- 0.70 if (method=="pls") { # pls package approach - faster but estimates more components.... nComps <- spectratrait::find_optimal_components(dataset=cal.plsr.data, targetVariable=inVar, method=method, maxComps=maxComps, seg=seg, random_seed=random_seed) print(paste0("*** Optimal number of components: ", nComps)) } else { nComps <- spectratrait::find_optimal_components(dataset=cal.plsr.data, targetVariable=inVar, method=method, maxComps=maxComps, iterations=iterations, seg=seg, prop=prop, random_seed=random_seed) } dev.copy(png,file.path(outdir,paste0(paste0(inVar,"_PLSR_Component_Selection.png"))), height=2800, width=3400, res=340) dev.off();
segs <- 100 plsr.out <- plsr(as.formula(paste(inVar,"~","Spectra")),scale=FALSE,ncomp=nComps,validation="CV", segments=segs, segment.type="interleaved",trace=FALSE,data=cal.plsr.data) fit <- plsr.out$fitted.values[,1,nComps] pls.options(parallel = NULL) # External validation fit stats par(mfrow=c(1,2)) # B, L, T, R pls::RMSEP(plsr.out, newdata = val.plsr.data) plot(pls::RMSEP(plsr.out,estimate=c("test"),newdata = val.plsr.data), main="MODEL RMSEP", xlab="Number of Components",ylab="Model Validation RMSEP",lty=1,col="black",cex=1.5,lwd=2) box(lwd=2.2) pls::R2(plsr.out, newdata = val.plsr.data) plot(R2(plsr.out,estimate=c("test"),newdata = val.plsr.data), main="MODEL R2", xlab="Number of Components",ylab="Model Validation R2",lty=1,col="black",cex=1.5,lwd=2) box(lwd=2.2) dev.copy(png,file.path(outdir,paste0(paste0(inVar,"_Validation_RMSEP_R2_by_Component.png"))), height=2800, width=4800, res=340) dev.off(); par(opar)
#calibration cal.plsr.output <- data.frame(cal.plsr.data[, which(names(cal.plsr.data) %notin% "Spectra")], PLSR_Predicted=fit, PLSR_CV_Predicted=as.vector(plsr.out$validation$pred[,,nComps])) cal.plsr.output <- cal.plsr.output %>% mutate(PLSR_CV_Residuals = PLSR_CV_Predicted-get(inVar)) head(cal.plsr.output) cal.R2 <- round(pls::R2(plsr.out,intercept=F)[[1]][nComps],2) cal.RMSEP <- round(sqrt(mean(cal.plsr.output$PLSR_CV_Residuals^2)),2) val.plsr.output <- data.frame(val.plsr.data[, which(names(val.plsr.data) %notin% "Spectra")], PLSR_Predicted=as.vector(predict(plsr.out, newdata = val.plsr.data, ncomp=nComps, type="response")[,,1])) val.plsr.output <- val.plsr.output %>% mutate(PLSR_Residuals = PLSR_Predicted-get(inVar)) head(val.plsr.output) val.R2 <- round(pls::R2(plsr.out,newdata=val.plsr.data,intercept=F)[[1]][nComps],2) val.RMSEP <- round(sqrt(mean(val.plsr.output$PLSR_Residuals^2)),2) rng_quant <- quantile(cal.plsr.output[,inVar], probs = c(0.001, 0.999)) cal_scatter_plot <- ggplot(cal.plsr.output, aes(x=PLSR_CV_Predicted, y=get(inVar))) + theme_bw() + geom_point() + geom_abline(intercept = 0, slope = 1, color="dark grey", linetype="dashed", linewidth=1.5) + xlim(rng_quant[1], rng_quant[2]) + ylim(rng_quant[1], rng_quant[2]) + labs(x=paste0("Predicted ", paste(inVar), " (units)"), y=paste0("Observed ", paste(inVar), " (units)"), title=paste0("Calibration: ", paste0("Rsq = ", cal.R2), "; ", paste0("RMSEP = ", cal.RMSEP))) + theme(axis.text=element_text(size=18), legend.position="none", axis.title=element_text(size=20, face="bold"), axis.text.x = element_text(angle = 0,vjust = 0.5), panel.border = element_rect(linetype = "solid", fill = NA, linewidth=1.5)) cal_resid_histogram <- ggplot(cal.plsr.output, aes(x=PLSR_CV_Residuals)) + geom_histogram(alpha=.5, position="identity") + geom_vline(xintercept = 0, color="black", linetype="dashed", linewidth=1) + theme_bw() + theme(axis.text=element_text(size=18), legend.position="none", axis.title=element_text(size=20, face="bold"), axis.text.x = element_text(angle = 0,vjust = 0.5), panel.border = element_rect(linetype = "solid", fill = NA, linewidth=1.5)) rng_quant <- quantile(val.plsr.output[,inVar], probs = c(0.001, 0.999)) val_scatter_plot <- ggplot(val.plsr.output, aes(x=PLSR_Predicted, y=get(inVar))) + theme_bw() + geom_point() + geom_abline(intercept = 0, slope = 1, color="dark grey", linetype="dashed", linewidth=1.5) + xlim(rng_quant[1], rng_quant[2]) + ylim(rng_quant[1], rng_quant[2]) + labs(x=paste0("Predicted ", paste(inVar), " (units)"), y=paste0("Observed ", paste(inVar), " (units)"), title=paste0("Validation: ", paste0("Rsq = ", val.R2), "; ", paste0("RMSEP = ", val.RMSEP))) + theme(axis.text=element_text(size=18), legend.position="none", axis.title=element_text(size=20, face="bold"), axis.text.x = element_text(angle = 0,vjust = 0.5), panel.border = element_rect(linetype = "solid", fill = NA, linewidth=1.5)) val_resid_histogram <- ggplot(val.plsr.output, aes(x=PLSR_Residuals)) + geom_histogram(alpha=.5, position="identity") + geom_vline(xintercept = 0, color="black", linetype="dashed", linewidth=1) + theme_bw() + theme(axis.text=element_text(size=18), legend.position="none", axis.title=element_text(size=20, face="bold"), axis.text.x = element_text(angle = 0,vjust = 0.5), panel.border = element_rect(linetype = "solid", fill = NA, linewidth=1.5)) # plot cal/val side-by-side scatterplots <- grid.arrange(cal_scatter_plot, val_scatter_plot, cal_resid_histogram, val_resid_histogram, nrow=2, ncol=2) ggsave(filename = file.path(outdir,paste0(inVar,"_Cal_Val_Scatterplots.png")), plot = scatterplots, device="png", width = 32, height = 30, units = "cm", dpi = 300)
vips <- spectratrait::VIP(plsr.out)[nComps,] par(mfrow=c(2,1)) plot(plsr.out, plottype = "coef",xlab="Wavelength (nm)", ylab="Regression coefficients",legendpos = "bottomright", ncomp=nComps,lwd=2) box(lwd=2.2) plot(seq(Start.wave,End.wave,1),vips,xlab="Wavelength (nm)",ylab="VIP",cex=0.01) lines(seq(Start.wave,End.wave,1),vips,lwd=3) abline(h=0.8,lty=2,col="dark grey") box(lwd=2.2) dev.copy(png,file.path(outdir,paste0(inVar,'_Coefficient_VIP_plot.png')), height=3100, width=4100, res=340) dev.off(); par(opar)
if(grepl("Windows", sessionInfo()$running)){ pls.options(parallel =NULL) } else { pls.options(parallel = parallel::detectCores()-1) } seg <- 100 jk.plsr.out <- pls::plsr(as.formula(paste(inVar,"~","Spectra")), scale=FALSE, center=TRUE, ncomp=nComps, validation="CV", segments = seg, segment.type="interleaved", trace=FALSE, jackknife=TRUE, data=cal.plsr.data) pls.options(parallel = NULL) Jackknife_coef <- f.coef.valid(plsr.out = jk.plsr.out, data_plsr = cal.plsr.data, ncomp = nComps, inVar=inVar) Jackknife_intercept <- Jackknife_coef[1,,,] Jackknife_coef <- Jackknife_coef[2:dim(Jackknife_coef)[1],,,] interval <- c(0.025,0.975) Jackknife_Pred <- val.plsr.data$Spectra %*% Jackknife_coef + matrix(rep(Jackknife_intercept, length(val.plsr.data[,inVar])), byrow=TRUE, ncol=length(Jackknife_intercept)) Interval_Conf <- apply(X = Jackknife_Pred, MARGIN = 1, FUN = quantile, probs=c(interval[1], interval[2])) sd_mean <- apply(X = Jackknife_Pred, MARGIN = 1, FUN =sd) sd_res <- sd(val.plsr.output$PLSR_Residuals) sd_tot <- sqrt(sd_mean^2+sd_res^2) val.plsr.output$LCI <- Interval_Conf[1,] val.plsr.output$UCI <- Interval_Conf[2,] val.plsr.output$LPI <- val.plsr.output$PLSR_Predicted-1.96*sd_tot val.plsr.output$UPI <- val.plsr.output$PLSR_Predicted+1.96*sd_tot head(val.plsr.output)
spectratrait::f.plot.coef(Z = t(Jackknife_coef), wv = wv, plot_label="Jackknife regression coefficients",position = 'bottomleft') abline(h=0,lty=2,col="grey50") box(lwd=2.2) dev.copy(png,file.path(outdir,paste0(inVar,'_Jackknife_Regression_Coefficients.png')), height=2100, width=3800, res=340) dev.off();
rmsep_percrmsep <- spectratrait::percent_rmse(plsr_dataset = val.plsr.output, inVar = inVar, residuals = val.plsr.output$PLSR_Residuals, range="full") RMSEP <- rmsep_percrmsep$rmse perc_RMSEP <- rmsep_percrmsep$perc_rmse r2 <- round(pls::R2(plsr.out, newdata = val.plsr.data, intercept=F)$val[nComps],2) expr <- vector("expression", 3) expr[[1]] <- bquote(R^2==.(r2)) expr[[2]] <- bquote(RMSEP==.(round(RMSEP,2))) expr[[3]] <- bquote("%RMSEP"==.(round(perc_RMSEP,2))) rng_vals <- c(min(val.plsr.output$LPI), max(val.plsr.output$UPI)) par(mfrow=c(1,1), mar=c(4.2,5.3,1,0.4), oma=c(0, 0.1, 0, 0.2)) plotrix::plotCI(val.plsr.output$PLSR_Predicted,val.plsr.output[,inVar], li=val.plsr.output$LPI, ui=val.plsr.output$UPI, gap=0.009,sfrac=0.004, lwd=1.6, xlim=c(rng_vals[1], rng_vals[2]), ylim=c(rng_vals[1], rng_vals[2]), err="x", pch=21, col="black", pt.bg=scales::alpha("grey70",0.7), scol="grey50", cex=2, xlab=paste0("Predicted ", paste(inVar), " (units)"), ylab=paste0("Observed ", paste(inVar), " (units)"), cex.axis=1.5,cex.lab=1.8) abline(0,1,lty=2,lw=2) legend("topleft", legend=expr, bty="n", cex=1.5) box(lwd=2.2) dev.copy(png,file.path(outdir,paste0(inVar,"_PLSR_Validation_Scatterplot.png")), height=2800, width=3200, res=340) dev.off();
out.jk.coefs <- data.frame(Iteration=seq(1,seg,1), Intercept=Jackknife_intercept,t(Jackknife_coef)) head(out.jk.coefs)[1:6] write.csv(out.jk.coefs,file=file.path(outdir, paste0(inVar, '_Jackkife_PLSR_Coefficients.csv')), row.names=FALSE)
print(paste("Output directory: ", getwd())) # Observed versus predicted write.csv(cal.plsr.output,file=file.path(outdir, paste0(inVar,'_Observed_PLSR_CV_Pred_', nComps,'comp.csv')), row.names=FALSE) # Validation data write.csv(val.plsr.output,file=file.path(outdir, paste0(inVar,'_Validation_PLSR_Pred_', nComps,'comp.csv')), row.names=FALSE) # Model coefficients coefs <- coef(plsr.out,ncomp=nComps,intercept=TRUE) write.csv(coefs,file=file.path(outdir, paste0(inVar,'_PLSR_Coefficients_', nComps,'comp.csv')), row.names=TRUE) # PLSR VIP write.csv(vips,file=file.path(outdir, paste0(inVar,'_PLSR_VIPs_', nComps,'comp.csv')))
print("**** PLSR output files: ") print(list.files(outdir)[grep(pattern = inVar, list.files(outdir))])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.