#' @title Distance of observed taxa versus sample distribution via Bayesian bootstrap
#' @description Field sampling is biased to particular parts of a gradient. As such taxa occupy
#' a particular part of this gradient. However, a increase or decrease in occurrence along this gradient
#' might simply because particular parts of the gradient are unequally sampled. The KS-test compares the
#' Theoretical Cumulative Distribution Function of a chosen distribution to the Empirical Cumulative
#' Distribution of a sample. The deviation of the two is expressed as D. This method compares the
#' Empirical Cumulative Distribution of a sampled taxa to the Empirical Cumulative Distribution of
#' the sample. D then denotes the "strength" of the deviation from the entire sampled gradient
#' (sample effort). The D is calculated by subtracting the Empirical Cumulative Distribution of
#' the samples from the Empirical Cumulative Distribution of the taxa. In both cases the Empirical
#' Cumulative Distributions are normalized between 0-1 as such D can only obtain
#' a value between -1 or 01. How closer this value is to 0 the less the distribution of the species
#' deviates from the "sample effort". The location wher D is either at minimal or maximal is
#' denoted as the Split (S) this value indicates the location along the gradient where the Deviation
#' between the Empirical Distributions is as maximum. In simplicity this can be noted as the sample
#' "threshold" where the species distribution along a gradient deviates at maximum from the sample gradient.
#' This whole process is bootstrapped with the "Bayesian Bootstrap" (BB) (Rubin et al. 1981), which is
#' more a smoothed version of the classical bootstrap. As the BB uses the Dirichlet distribution as
#' the prior reflects a flat/uniform prior returning the likelihood. The question is if this is really
#' Bayesian. Preferred is that the intervals are simply interpreted as confidence intervals considering
#' that the principle of indifference is not a reasonable assumption at al. The plot an idea are derived
#' from TITAN (Bakker and King, 2010). However, the method is comparatively faster simpler and easier
#' to interpret.
#' @param gradient A numeric vector representing the gradient along which the taxa has been observed (e.g. concentration of nutrients).
#' @param taxa A character vector with the names of all taxa needs to be the same length at the gradient.
#' @param xlab An argument representing the name of the gradient.
#' @param nboot The number of bootstraps used in the analysis.
#' @param size Size of the point in plot "A". Default is 10 but might be to large.
#'
#' @details The function returns a list of three objects: a data frame with the Taxa with D an S, a data frame with S for all taxa and
#' a plotted figure (ggplot2).
#'
#' @export Dfun
#'
#' @examples
#' \dontrun{
#'library(readr)
#'df <- read_csv(url("https://raw.githubusercontent.com/snwikaij/Data/main/Aquatic_Botany_Kaijser_et_al._2019.csv"))
#'
#'#Run the analysis
#' results <- Dfun(df$Mediaan, df$Soort, size=6)
#'
#'#Display the results
#' results$Plot}
Dfun <- function(gradient, taxa, xlab="Gradient", nboot=1000, size=10){
df <- cbind(gradient, as.character(taxa))
bb_fun <- function(x){
lx <- length(x)
wx <- matrix(rexp(lx, 1), ncol = lx, byrow = TRUE)
wx <- wx/rowSums(wx)
bx <- sample(x, size = lx, replace = T, prob = wx)
return(bx)}
cus_fun <- function(z){
z[,2] <- as.numeric(z[,2])
z <- z[order(z[,2]),]
z$V3 <- cumsum(1:nrow(z))/max(cumsum(1:nrow(z)))
return(z)}
looplist <- list()
for(l in 1:nboot){
print(l)
splitdf <- split(df[,1], df[,2])
w <- sapply(splitdf, function(x) bb_fun(x))
y <- list()
for(i in 1:length(w)){y[[i]] <- cbind(names(w)[i], w[[i]])}
v <- do.call(rbind.data.frame,y)
r <- cus_fun(v)
splitdfspec <- split(r, r$V1)
q <- lapply(splitdfspec, function(x) cbind(x[,1:2], cumsum(1:dim(x)[1])/max(cumsum(1:dim(x)[1]))))
lpdf <- data.frame(Taxa=rep(NA, length(q)), Distance=rep(NA, length(q)), Split=rep(NA, length(q)))
count <- 0
for(i in names(q)){
count <- count+1
s <- q[[i]]
rownames(s)<-NULL
n <- r[r$V1==i,]
Dl <- n$V3-s[,3]
m <- abs(Dl)
D <- max(m)
lpdf[count,] <- cbind(i, D*sign(Dl[which.max(m)]), s$V2[which.max(m)])
}
looplist[[l]] <- lpdf
}
f <- do.call(rbind, looplist)
res <- aggregate(data=f, .~Taxa, function(x) c(mean(as.numeric(x)), quantile(as.numeric(x), c(.025, 0.975))))
results <- cbind.data.frame(res$Taxa, res$Distance, res$Split)
colnames(results) <- c("Taxa", "D", "D1", "D2", "S", "S1", "S2")
results <- cbind.data.frame(results[c("Taxa")], apply(results[-1], 2, as.numeric))
pos <- results[results$D >= 0,]
neg <- results[results$D < 0,]
d <- rbind.data.frame(quantile(neg$S, c(.025, .975)), c(quantile(pos$S, c(.025, .975))))
d <- setNames(cbind(c("Negative", "Positive"), c(mean(neg$S), mean(pos$S)), d), c("Group", "S", "S1", "S2"))
rownames(d) <- NULL
d$Group <- factor(d$Group, levels = c("Positive", "Negative"))
pl1 <- ggplot(results, aes(x=S, y=reorder(Taxa, -S)))+
ylab("")+xlab("")+
geom_errorbarh(aes(xmin = S1, xmax = S2, height = 0))+
geom_point(aes(fill = results$D), fill = ifelse(results$D > 0, "white", "black"), pch=21, size = abs(results$D)*size)+
theme_classic()+
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = "black"),
axis.title.y = element_text(colour = "black"),
legend.position = "none")
pl2 <- ggplot(d, aes(x=S, y=Group))+
ylab("")+
xlab(xlab)+
xlim(ggplot_build(pl1)$layout$panel_scales_x[[1]]$range$range)+
geom_errorbarh(aes(xmin = S1, xmax = S2, height = 0))+
geom_point(fill = c("black", "white"), pch=21, size=6)+
theme_classic()+
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(colour = "black"),
axis.title.y = element_text(colour = "black"),
legend.position = "none")
plot <- cowplot::plot_grid(pl1,pl2,labels = "AUTO", align = "v", ncol=1,
rel_heights = c(1.6, 0.4))
return(list(Taxa=results, Binary=d, Plot=plot))}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.