#' Plot Taxa Response Curve and Calculate Species Extirpation Concentrations
#'
#' The output of this function to return 1.Weighted Average, 2. cdf_Abundance based, 3. cdf_ presence/absence based;
#' 4. ecdf weighted, 5. cdf weight new; 6. Linear logistic regression, 7. quadratic logistic 8. GAM 5~7 using full data range;
#' 9~11. repeat 6~8 but uses observed range for each single taxon; 12 Count. 13. Raw quantiles
#'
#' @param spdata Species data.
#' @param envdata Environmental data.
#' @param sp.siteid Site/sample id column for spdata; default = "Sample.ID"
#' @param species ; default = "GENUS"
#' @param sp.abndid Name of column for species relative abundance within sample; default = "RA"
#' @param env.siteid Site/sample id column for envdata; default = "Sample.ID"
#' @param xvar xvariable, could be column index or name; default = "COND"
#' @param cutoff a required minimum sample size for calculation; default = 30
#' @param region Region (e.g., ecoregion or bioregion) used to create directory for PDF output; default = "all"
#' @param lim if lim == "GAM", add gam plot xc95 otherwise, add "CDF"; default ="CDF"
#' @param coord ; default = NULL
#' @param mtype could be 1 to 3, indicating which regression model to use; default = 3
#' @param dense.N is the number of areas to cut into in the calculation of area under the curve; default = 201
#' @param plot.pdf to decide if we want species vs. env plots options "none", "pdf", "tiff"; default = F
#' @param add.map to decide if a map should be added before plots; default =F
#' @param statename ; default = NULL
#' @param add.lab ; default = F
#' @param main Plot title (main); default = "Capture Probability of Macroinvertebrate Taxon Along Conductivity Gradient"
#' @param mar ; default = c(5,4,3, 2)
#' @param xlabs ; default = expression(paste("Conductivity ( ", mu, "S/cm)"))
#' @param log.x if xvar should be logtransformated; default = TRUE
#' @param rounder xvar rounder, default = 0.
#' @param taus determine the output the percentile of env variable; default = c(0,95,100)
#' @param nbin Number of bins; default = 61.
#' @param wd Working directory for saving files.
#~~~~~~~~~~~~
# Lei had the ones below but they don't match the function
# @param df1, data frame
# @param sort.vect when plot, sort the taxa list according to a vector called file called sort.vec
# @param xvar. xvariable, could be column index or name
# @param cutoff a required minimum sample size for calculation
# @param region a subregion code to name the final output files
# @param mtype could be 1 to 3, indicating which regression model to use.
# @param dense.N is the number of areas to cut into in the calculation of area under the curve
# @param plot.pdf to decide if we want species vs. env plots options "none", "pdf", "tiff"
# @param add.map to decide if a map should be added before plots.
# @param maintext title of the multiplots area
# @param nbin number of bins for logits
# @param log.x if xvar should be logtransformated
# @param rounder xvar rounder, default is 0
# @param taus determine the output the percentile of env variable
# @param lim if lim == "GAM", add gam plot xc95 otherwise, add "CDF"
#~~~~~~~~~~~~~~
#' @return data frame and PDFS of CDF and GAM plots saved to the specified directory in subfolders ("cdf" and "gam")
#' @keywords logistic regression, quantiles, xc95, hc05, cdf, gam, taxon response
#' @examples
#' # Environmental Variables
#' varlist <- c("lgX3daymax", "lgMAF", "lgFallrate", "lgHigh1fall", "RBI", "lgX3daymin")
#' varnames <- c("3-day Max", "Mean Annual Flow", "Fall Rate", "High 1 Fall", "RBI", "3-Day Min")
#' # select variable
#' vari <- 1
#' mydata <- envdata.all[!is.na(envdata.all[,varlist[vari]]),]
#' # run function
#' whole.values <- taxon.response(spdata = species, envdata = mydata, sp.siteid = "BenSampID", species = "OTU",
#' sp.abndid = "RA", env.siteid = "BenSampID", xvar = varlist[vari], cutoff = 20, region = "tol_all", lim ="GAM",
#' coord = c("BioSta_Long", "BioSta_Lat"), mtype = 3, dense.N = 201, plot.pdf = T, add.map = F, statename = NULL,
#' add.lab = F, main = paste("Macroinvertebrates Response to", varnames[vari]), mar = c(5,4,3,2),
#' xlabs= varnames[vari], log.x = F, rounder = 3, taus = c(0,50,100), nbin = 51, wd = getwd())
#' # view results
#' View(whole.values)
#' @export
taxon.response <- function(spdata, envdata, sp.siteid="Sample.ID", species="GENUS", sp.abndid="RA",
env.siteid="Sample.ID", xvar="COND", cutoff=30, region = "all", lim ="CDF", coord = NULL,
mtype = 3, dense.N = 201, plot.pdf = F, add.map=F, statename = NULL, add.lab = F,
main = "Capture Probability of Macroinvertebrate Taxon Along Conductivity Gradient",
mar = c(5,4,3, 2), xlabs=expression(paste("Conductivity ( ", mu, "S/cm)")), log.x=TRUE,
rounder=0, taus=c(0,95,100), nbin = 61, wd=getwd()) {##FUNCTION.taxon.response.START
#
graphics.off()
# 20170417, add directory check
dir.check.add(wd,region)
#
dfenv <- envdata[, c(env.siteid, xvar, coord)]
xlims <- range(dfenv[,xvar], na.rm = TRUE)
dfenv[,xvar] <- (dfenv[,xvar] - xlims[1])/diff(xlims)
# xrange <- dfenv[,xvar]
bcnt0 <- merge(spdata, envdata, by.x = sp.siteid, by.y = env.siteid) # big table merged togetther
# print(dim(bcnt0))
bcnt0 <- (bcnt0[bcnt0[, sp.abndid]>0,])
# print(dim(bcnt0))
# assign station names to sitenames
sitenames.u <- sort(unique(unlist(bcnt0[,sp.siteid]))) # unique site names
nsamp <- length(sitenames.u) # count total number of stations
luniq <- function(x) length(unique(x)) # function to count unique id
numocc <- tapply(bcnt0[, sp.siteid], bcnt0[,species], luniq) # No. of occurance(unique stations) for a list of taxa
# numocc2 <- nsamp - numocc # total stations- vector of stations for all taxa
# numocc3 <- pmin(numocc, numocc2, na.rm= TRUE) # find the min of present and absent's sites
numocc <- numocc[!is.na(numocc)]
incvec <- numocc >= cutoff # presence/absence min >= cutoff value
tnames <- sort(names(numocc)[incvec]) # selected taxa list
ntaxa <- length(tnames) # length of taxa list for taxonomic level i
# Build site species matrix for only taxa with enough observations
abund.all <- bcnt0[, sp.abndid] # abundance only
ss1 <- matrix(0, ncol = ntaxa, nrow = nsamp) # build a matrix to store crosstab table
for (j in 1:ntaxa) {##FOR.j.START # all taxa in j list
incvec <- bcnt0[, species] == tnames[j] # match a taxon name at the selected taxonomic level with tnames.loc[j]
incvec[is.na(incvec)] <- FALSE
sitenames.g <- bcnt0[incvec,sp.siteid] # selected rows/station for taxon j
abund.g <- bcnt0[incvec,sp.abndid] # selected abundance data for taxon j
abund.s <- tapply(abund.g, sitenames.g, sum) # crosstab total abundance for taxon j
abund.s[is.na(abund.s)] <- 0
sitenames.s <- names(abund.s)
for (k in 1:length(sitenames.s)) {
ss1[match(sitenames.s[k], sitenames.u), j] <- abund.s[k]
}
}##FOR.j.END
ss <- data.frame(sitenames.u, ss1)
names(ss) <- c("SITEID", tnames)
df1 <- merge(ss, dfenv, by.x = "SITEID", by.y = env.siteid) # merged crosstab abundance file
df1[is.na(df1[2:(ntaxa + 1)]),2:(ntaxa + 1)] <- 0
nc <- ncol(df1)
df1 <- df1[order(df1[,xvar]),]
df2 <- df1
df2[2:(ntaxa + 1)][df2[2:(ntaxa + 1)] > 0] <- 1 # p/a file
stress <- df1[,xvar][!is.na(df1[,xvar])]
stress.u <- sort(unique(stress))
xrange <- range(stress)
xnew <- seq(from = xrange[1], to = xrange[2], length = 100)
###################### A weighted function to calculate cumsum
ecdf.w <- function (x,w) {##FUNCTION.ecdf.w.START
w1 <- round(1000000*w)
x0 <- numeric(0)
for (i in 1:length(x)) {
x0 <- c(x0, rep(x[i], times = w1[i]))
}
x0 <- sort(x0)
n <- length(x0)
if (n < 1)
stop("'x' must have 1 or more non-missing values")
vals <- unique(x0)
rval <- approxfun(vals, cumsum(tabulate(match(x0, vals)))/n,
method = "constant", yleft = 0, yright = 1,
f = 0, ties = "ordered")
class(rval) <- c("ecdf", "stepfun", class(rval))
attr(rval, "call") <- sys.call()
rval
}##FUNCTION.ecdf.w.END
###### calculate weight for cdf
cutp <- seq(from = min(stress), to = max(stress), length = nbin)
cutm <- 0.5*(cutp[-1] + cutp[-nbin])
df2$cutf <- cut(df2[,xvar], cutp, include.lowest = T)
wt <- 1/table(df2$cutf)
wt[is.infinite(wt)] <- NA
wt <- wt/sum(wt, na.rm = T)
dftemp <- data.frame(cutf = names(wt), wt = as.vector(wt))
df2 <- merge(df2, dftemp, by = "cutf")[-1]
df2 <- df2[order(df2[,xvar]),]
#### end
# ##### function to compute area under the curve
# auc <- function(xrange, mod, dense.N) {##FUNCTION.auc.START
# x <- seq(min(xrange), max(xrange), length = dense.N - 1)
# s.area <-rep(NA, dense.N - 1)
# y <- predict(mod, newdata = data.frame(dose = x), type = "response")
# for (index in 1:dense.N-1) {
# s.area[index] <- (y[index] + y[index + 1])/2*(x[index+1]-x[index])
# }
# tsum <- sum(s.area, na.rm=T)
# jj=1
# csum = sum(s.area[1:jj])
# while(csum < taus[2]/100*tsum) {
# jj = jj + 1
# csum <- sum(s.area[1:jj], na.rm=T)
# }
#
# xc95 <- (x[jj]+ x[jj-1])/2
# yc95 <- (y[jj]+ y[jj-1])/2
# return(c(xc95,yc95))
# }##FUNCTION.auc.END
# print(dim(df1))
# plot pdf option
if(plot.pdf) {##IF.plot.pdf.START
pdf(file = paste(wd,"/", region, "/", region, ".taxon.cdf.pdf",sep=""),
width = 9, height = 6, pointsize = 12)
par(mfrow = c(2, 3), pty = "m", mar = mar, oma=c(1.5, 0.5, 2,0.5) )
pdf(file = paste(wd,"/", region, "/", region, ".taxon.gam.pdf",sep=""),
width = 9, height = 6.5, pointsize = 12)
par(mfrow = c(2, 3), pty = "m", mar =mar, oma=c(1.5, 0.5, 2,0.5) )
totpage <- ceiling((length(tnames)+1)/6)
if(add.map) {
simpleCap <- function(x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)), substring(s, 2),
sep="", collapse=" ")
}
med = 1
while(is.na(df1[med, coord[1]])) med = 1 + med
if(is.null(statename)) {
statename <<- map.where("state", df1[med, coord[1]], df1[med, coord[2]])
statename <- simpleCap(statename)
}
if(add.lab) {
map.text("state", region = statename, mar = mar)
} else {
map("state", regions = statename, mar = mar)
}
text(df1[,coord[1]],df1[,coord[2]],".")
# mtext(paste("Ecoregion",region), side =3,line = 0.5, cex=1.5)
}
}##IF.plot.pdf.END
optsave <- matrix(NA, ncol = length(taus) + 12, nrow = ntaxa)
##~~~~~~~~~~~~~~~
for (i in 1:ntaxa) {##FOR.i.START
isel <- match(tnames[i], names(df1)) # selected taxa i
## (1) Weighted averaging
WA <- sum(df1[,isel]*df1[,xvar])/sum(df1[,isel])
tol <- sqrt(sum(df1[,isel] * (df1[,xvar]- WA)^2) /sum(df1[,isel]))
# WTA <- sum(df1[,isel]*df1[,xvar]*df2$wt)/sum(df1[,isel]*df2$wt) # sample weighted weighted average
### (2) cdf or cumsum no weighting both abundance based and p/a based
csum1 <-cumsum(df1[,isel])/sum(df1[,isel]) # cumlative vectors devided total abundance
ic1 <- 1
while(csum1[ic1] < taus[2]/100) ic1<- ic1 + 1
#### (3) cdf p/a
csum2 <-cumsum(df2[,isel])/sum(df2[,isel]) # cumlative vectors devided total abundance
ic2 <- 1
while(csum2[ic2] < taus[2]/100) ic2<- ic2 + 1
##### (4) cdf weighted
resp <- df2[, isel] > 0
sel <- df2[resp,]
# eout <- ecdf.w(sel[,xvar], sel$wt)
# ic3 <- 1
# while(eout(stress.u[ic3]) < taus[2]/100) ic3 <- ic3 + 1
eout2 <- Hmisc::wtd.quantile(sel[,xvar], sel$wt, normwt = TRUE, prob = taus[2]/100)
######### (5)(6)(7) logistic regression model
resp <- df2[, isel]
dose <- df2[, xvar]
lrm1 <- glm(resp ~ dose, family="binomial")
lrm2 <- glm(resp ~ dose + I(dose^2), family="binomial")
lrm3 <- mgcv::gam(resp ~ s(dose,k=3), family= "binomial")
if(mtype == 1) model <- lrm1
if(mtype == 2) model <- lrm2
if(mtype == 3) model <- lrm3
predresp<- predict(model, newdata = data.frame(dose=xnew), type="link", se.fit=T)
# Compute upper and lower 90% confidence limits
up.bound.link <- predresp$fit + qnorm(0.95)*predresp$se.fit
low.bound.link <- predresp$fit - qnorm(0.95)*predresp$se.fit
mean.resp.link <- predresp$fit
# Convert from logit transformed values to probability.
up.bound <- exp(up.bound.link)/(1+exp(up.bound.link))
low.bound <- exp(low.bound.link)/(1+exp(low.bound.link))
mean.resp <- exp(mean.resp.link)/(1+exp(mean.resp.link))
#### 5,6,7 full range
auc.version <- "response"
######## find modeled xc95 full data range
lrm1.95f <- auc(xrange = xrange, mod = lrm1, dense.N = dense.N, taus, auc.version)
lrm2.95f <- auc(xrange = xrange, mod = lrm2, dense.N = dense.N, taus, auc.version)
lrm3.95f <- auc(xrange = xrange, mod = lrm3, dense.N = dense.N, taus, auc.version)
#### 8,9,10 observed range
######## find modeled xc95 observed data range
shortrange <- range(df2[df2[,isel]>0,xvar])
lrm1.95p <- auc(xrange = shortrange, mod = lrm1, dense.N = dense.N, taus, auc.version)
lrm2.95p <- auc(xrange = shortrange, mod = lrm2, dense.N = dense.N, taus, auc.version)
lrm3.95p <- auc(xrange = shortrange, mod = lrm3, dense.N = dense.N, taus, auc.version)
##### 11 n values 12 quantiles
samplen <- length(df2[df2[,isel]>0,xvar])
limits<- quantile(df2[df2[,isel]>0,xvar],probs=taus/100) # quantile calculation
###### Save results
if(log.x) {##IF.log.x.START
optsave[i,1:(length(taus)+11)] <- round(10^(c(WA, tol, df1[ic1,xvar],df1[ic2,xvar], eout2,
lrm1.95f[1],lrm2.95f[1],lrm3.95f[1],lrm1.95p[1],lrm2.95p[1],lrm3.95p[1], limits)*
diff(xlims) + xlims[1]), rounder) # WA optimum
} else {
optsave[i,1:(length(taus)+11)] <- round(c(WA, tol,df1[ic1,xvar],df1[ic2,xvar],eout2,
lrm1.95f[1],lrm2.95f[1],lrm3.95f[1],lrm1.95p[1],lrm2.95p[1],lrm3.95p[1], limits)*
diff(xlims) + xlims[1], rounder)
}##IF.log.x.END
optsave[i,(length(taus)+12)] <- samplen
#### The following code calcualte probabilities of occurrence
binf <- cut(df2[,xvar], cutp, include.lowest = T)
bvals <- tapply(df2[, isel] > 0, binf, mean, na.rm =T)
# optiion for plot single species env graph
if(plot.pdf) {##IF.plot.pdf.START
dev.set(3)
plot(xnew, mean.resp, axes = FALSE, type = "l", xlab = "", ylab = "", xlim=c(0,1),
ylim = range(low.bound, up.bound, bvals, na.rm =T), col=1)
points(xnew, low.bound, type="l", col=1, lty="dashed")
points(xnew, up.bound, type="l", col=1,lty="dashed")
points(cutm, bvals, pch=21, col= 1, cex = 0.7, bg ="gray") # probability
# full <- get(paste("lrm",mtype,".95f", sep=""))
# part <- get(paste("lrm",mtype,".95p", sep=""))
# segments(part[1], part[2],part[1],0, col="red", lty="dashed") # partial range
# segments(full[1], full[2],full[1],0, col="blue", lty= "dashed") # full range
if(lim =="GAM") { abline(v= lrm3.95f[1], lty= "dashed", col="red") ##IF.lim.START
} else if(lim =="CDF"){
abline(v= eout2, lty= "dashed", col="blue")
}##IF.lim.END
max.pow <- ceiling(max(xlims)); min.pow <- floor(min(xlims)) ## add x range for log formation
if(log.x) {##IF.log.x.START
at0 <- (min.pow:max.pow - xlims[1])/diff(xlims) # add ticks at log10 = even
lab1 <- 10^(min.pow:max.pow)
axis(1, at = at0, labels = lab1, lwd.ticks = 1.2) # major ticks with labels
axis(1, at = (log10(1:10 * rep(lab1[-1]/10, each = 10)) -xlims[1])/diff(xlims), tcl = -0.3,
labels = FALSE, lwd.ticks = 0.8) # minor ticks xtick <- at0 <= max(x) & axis.at >= xmin # major labels
xtick <- at0 <= max(df1[,xvar]) & at0 >= min(df1[,xvar]) # major labels
### if only two or fewer major lables, then add tick labels at 2 and 5 log
if(sum(xtick)<=2) {##IF.xtick.START
axis(1, at = (log10(c(2, 5) * rep(lab1[-1]/10, each = 2)) -xlims[1])/diff(xlims), tcl = -0.4,lwd.ticks= 1.2,
labels = (c(2, 5) * rep(lab1[-1]/10, each = 2)))
}##IF.xtick.END
} else {
at0 <- (pretty(xlims) - xlims[1])/diff(xlims) # add ticks
# lab1 <- round(at0*diff(xlims) + xlims[1], digits = rounder) # scale back to orginial value
axis(1, at = at0, labels = pretty(xlims), lwd.ticks = 1.5, las = 1) # major ticks with labels
}##IF.log.x.END
axis(2)
# axis(4)
# mtext("Relative Abundance", side = 4, line = 2.5, col=1)
mtext(xlabs, side = 1, line = 2.3)
mtext("Capture Probability", side = 2, line = 2.3,col=1)
mtext(bquote(italic(.(tnames[i]))), side = 3, line = 0.5)
box()
if((i+5)/6-round((i+5)/6)==0) {##IF.i+5.START
title(main= main, outer=TRUE, cex.main= 1.8)
mtext(paste("Page", (i+5)/6, "of", totpage), outer = TRUE, side =1)
}##IF.i+5.END
#### plot cdf
dev.set(2)
Hmisc::Ecdf(sel[,xvar], weights = sel$wt, xlim = c(0,1), col = "blue", pch = 1,
axes = F, main = tnames[i], xlab = xlabs)
# plot.ecdf(sel[,xvar], xlim = c(0,1), col = "green", pch = 1,add = T,
# axes = F, main = tnames[i], xlab = xlabs)
lab1 <- round(axTicks(1)*diff(xlims) + xlims[1], digits = rounder) # scale back to orginial value
axis(1, at = axTicks(1), labels = lab1, lwd.ticks = 1.5, las = 1) # major ticks with labels
axis(2)
abline(h = c(taus[2]/100), col = "red", lty = 2)
abline(v = eout2, col = "red", lty = 2)
box(bty = "l")
}##IF.plot.pdf.END
}##FOR.i.END
optout <- data.frame(tnames, optsave)
names(optout) <- c("taxaname", "WAopt","WAtol", "CDF_ABD","CDF_PA", "CDF_WT_QTILE", "LRM_Full",
"QLRM_Full","GAM_Full", "LRM_Ob", "QLRM_Ob", "GAM_Ob",
paste("Observ",taus, sep=""),"N")
if(plot.pdf) graphics.off()
return(optout)
}##FUNCTION.taxon.response.END
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## inverse deshrinking function
## `inv.deshrink` <- function(env, wa.env) {
## X <- cbind(rep(1, length(wa.env)), wa.env)
## QR <- qr(X)
## coef <- qr.coef(QR, env)
## pred <- qr.fitted(QR, env)
## return(list(coefficients = coef, env = pred))
## }
##
## classical deshrinking
## `class.deshrink` <- function(env, wa.env) {
## X <- cbind(rep(1, length(env)), env)
## QR <- qr(X)
## coef <- drop(qr.coef(QR, wa.env))
## coef <- c(-coef[1], 1)/coef[2]
## pred <- deshrink.pred(wa.env, coef)
## return(list(coefficients = coef, env = pred))
## }
##
## deshrinking to equal sd
## A bit like in vegan:::wascores, but wascores uses weighted sd which
## would need row and column sums in the function call, and this would
## make the function API incompatible with other *.deshrink functions.
## `expand.deshrink` <- function(env, wa.env) {
## b1 <- sd(env)/sd(wa.env)
## b0 <- mean(env) - b1 * mean(wa.env)
## pred <- b0 + b1 * wa.env
## return(list(coefficients = c(b0, b1), env = pred))
## }
##
# Do not deshrink: for those who think they know what they do
## `no.deshrink` <- function(env, wa.env) {
## return(list(coefficients = c(0, 1), env = wa.env))
## }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.