NEMO: What the Package Does (One Line, Title Case)

Documented in Dfun

#' @title Distance of observed taxa versus sample distribution via Bayesian bootstrap
#' @description Field sampling is biased to particular parts of a gradient. As such taxa occupy
#' a particular part of this gradient. However, a increase or decrease in occurrence along this gradient
#' might simply because particular parts of the gradient are unequally sampled. The KS-test compares the
#' Theoretical Cumulative Distribution Function of a chosen distribution to the Empirical Cumulative
#' Distribution of a sample. The deviation of the two is expressed as D. This method compares the
#' Empirical Cumulative Distribution of a sampled taxa to the Empirical Cumulative Distribution of
#'  the sample. D then denotes the "strength" of the deviation from the entire sampled gradient
#'  (sample effort). The D is calculated by subtracting the Empirical Cumulative Distribution of
#'  the samples from the Empirical Cumulative Distribution of the taxa. In both cases the Empirical
#'  Cumulative Distributions  are normalized between 0-1 as such D can only obtain
#'  a value between -1 or 01. How closer this value is to 0 the less the distribution of the species
#'  deviates from the "sample effort". The location wher D is either at minimal or maximal is
#'  denoted as the Split (S) this value indicates the location along the gradient where the Deviation
#'  between the Empirical Distributions is as maximum. In simplicity this can be noted as the sample
#'  "threshold" where the species distribution along a gradient deviates at maximum from the sample gradient.
#'  This whole process is bootstrapped with the "Bayesian Bootstrap" (BB) (Rubin et al. 1981), which is
#'  more a smoothed version of the classical bootstrap. As the BB uses the Dirichlet distribution as
#'  the prior reflects a flat/uniform prior returning the likelihood. The question is if this is really
#'  Bayesian. Preferred is that the intervals are simply interpreted as confidence intervals considering
#'  that the principle of indifference is not a reasonable assumption at al. The plot an idea are derived
#'  from TITAN (Bakker and King, 2010). However, the method is comparatively faster simpler and easier
#'  to interpret.
#' @param gradient A numeric vector representing the gradient along which the taxa has been observed (e.g. concentration of nutrients).
#' @param taxa A character vector with the names of all taxa needs to be the same length at the gradient.
#' @param xlab An argument representing the name of the gradient.
#' @param nboot The number of bootstraps used in the analysis.
#' @param size Size of the point in plot "A". Default is 10 but might be to large.
#'
#' @details The function returns a list of three objects: a data frame with the Taxa with D an S, a data frame with S for all taxa and
#' a plotted figure (ggplot2).
#'
#' @export Dfun
#'
#' @examples
#' \dontrun{
#'library(readr)
#'df <- read_csv(url("https://raw.githubusercontent.com/snwikaij/Data/main/Aquatic_Botany_Kaijser_et_al._2019.csv"))
#'
#'#Run the analysis
#' results <- Dfun(df$Mediaan, df$Soort, size=6)
#'
#'#Display the results
#' results$Plot}
Dfun <- function(gradient, taxa, xlab="Gradient", nboot=1000, size=10){

  df <- cbind(gradient, as.character(taxa))

  bb_fun <- function(x){

    lx <- length(x)
    wx <- matrix(rexp(lx, 1), ncol = lx, byrow = TRUE)
    wx <- wx/rowSums(wx)

    bx     <- sample(x, size = lx, replace = T, prob = wx)

    return(bx)}

  cus_fun <- function(z){

    z[,2] <- as.numeric(z[,2])
    z     <- z[order(z[,2]),]
    z$V3  <- cumsum(1:nrow(z))/max(cumsum(1:nrow(z)))

    return(z)}

  looplist <- list()

  for(l in 1:nboot){

    print(l)

    splitdf     <- split(df[,1], df[,2])
    w           <- sapply(splitdf, function(x) bb_fun(x))
    y           <- list()
    for(i in 1:length(w)){y[[i]] <- cbind(names(w)[i], w[[i]])}

    v           <- do.call(rbind.data.frame,y)
    r           <- cus_fun(v)
    splitdfspec <- split(r, r$V1)

    q           <- lapply(splitdfspec, function(x) cbind(x[,1:2], cumsum(1:dim(x)[1])/max(cumsum(1:dim(x)[1]))))

    lpdf  <- data.frame(Taxa=rep(NA, length(q)), Distance=rep(NA, length(q)), Split=rep(NA, length(q)))
    count <- 0
    for(i in names(q)){
      count      <- count+1
      s          <- q[[i]]
      rownames(s)<-NULL
      n          <- r[r$V1==i,]
      Dl         <- n$V3-s[,3]
      m          <- abs(Dl)
      D          <- max(m)

      lpdf[count,] <- cbind(i, D*sign(Dl[which.max(m)]), s$V2[which.max(m)])
    }

    looplist[[l]] <- lpdf

  }

  f <- do.call(rbind, looplist)

  res               <- aggregate(data=f, .~Taxa, function(x) c(mean(as.numeric(x)), quantile(as.numeric(x), c(.025, 0.975))))
  results           <- cbind.data.frame(res$Taxa, res$Distance, res$Split)
  colnames(results) <- c("Taxa", "D", "D1", "D2", "S", "S1", "S2")
  results           <- cbind.data.frame(results[c("Taxa")], apply(results[-1], 2, as.numeric))
  pos               <- results[results$D >= 0,]
  neg               <- results[results$D < 0,]

  d                 <- rbind.data.frame(quantile(neg$S, c(.025, .975)), c(quantile(pos$S, c(.025, .975))))
  d                 <- setNames(cbind(c("Negative", "Positive"), c(mean(neg$S), mean(pos$S)), d), c("Group", "S", "S1", "S2"))
  rownames(d)       <- NULL
  d$Group           <- factor(d$Group, levels = c("Positive", "Negative"))

  pl1 <- ggplot(results, aes(x=S, y=reorder(Taxa, -S)))+
    ylab("")+xlab("")+
    geom_errorbarh(aes(xmin = S1, xmax = S2, height = 0))+
    geom_point(aes(fill = results$D), fill = ifelse(results$D > 0, "white", "black"), pch=21, size = abs(results$D)*size)+
    theme_classic()+
    theme(axis.line = element_line(colour = "black"),
          axis.text.y = element_text(colour = "black"),
          axis.title.y = element_text(colour = "black"),
          legend.position = "none")

  pl2 <- ggplot(d, aes(x=S, y=Group))+
    ylab("")+
    xlab(xlab)+
    xlim(ggplot_build(pl1)$layout$panel_scales_x[[1]]$range$range)+
    geom_errorbarh(aes(xmin = S1, xmax = S2, height = 0))+
    geom_point(fill = c("black", "white"), pch=21, size=6)+
    theme_classic()+
    theme(axis.line = element_line(colour = "black"),
          axis.text.y = element_text(colour = "black"),
          axis.title.y = element_text(colour = "black"),
          legend.position = "none")

  plot <- cowplot::plot_grid(pl1,pl2,labels = "AUTO", align = "v", ncol=1,
                             rel_heights = c(1.6, 0.4))

  return(list(Taxa=results, Binary=d, Plot=plot))}