R/densform.R

Defines functions .vecprob densform

Documented in densform

#' Generate standard probability density functions for each taxon/variable
#'
#' This function takes extracted climate data (from an object generated by the cRacle::extraction() function) for one taxon/species and generates probability density functions for variable using both a Gaussian (normal) approximation and a Gaussian Kernel Density estimator.
#' @param ex An object derived from the extraction() function.
#' @param clim A raster object (see raster::raster() and raster::stack() documentation for reading raster files into R).
#' @param name A character string describing (preferably) the group for which PDFs are being constructed (i.e., a species binomial). If none is supplied, a value of column "tax" is selected as a default.
#' @param bw A bandwidth compatible with stats::density(). Options include "nrd", "nrd0", "ucv", "bcv", etc.. Default (and recommended) value is "nrd0".
#' @param kern Type of Kernel to smooth with. Recommend 'gaussian', 'optcosine', or 'epanechnikov'. See: stats::density for options.
#' @param n Number of equally spaced points at which the probability density is to be estimated. Defaults to 1024. A lower number increases speed but decreases resolution in the function. A higher number increases resolution at the cost of speed. Recommended values: 512, 1024, 2048, ....
#' @param manip Character string of 'reg' for intersectional likelihood, 'condi' for conditional likelihood statement.
#' @param from vector of starting points by variable. Default is the variable layer minimum.
#' @param to vector of ending points by variable. Default is the variable layer maximum.
#' @param clip A character string of value "range" or "95conf" or "99conf". Should the probability functions be clipped to either the empirical range or the 95 or 99 percent confidence interval?
#' @param bg.n If manip = 'condi'. How many background points PER OCCURRENCE record should be sampled. Default is 1000.
#' @param bg Optionally send a matrix of an extraction object to use as the background sample.
#' @export
#' @examples \dontrun{
#' #distr <- read.table('test_mat.txt', head=T, sep ="\t");
#' data(distr);
#' data(climondbioclim);
#' extr.raw = extraction(data=distr, clim= climondbioclim, schema='raw');
#' extr.sub = subset(extr.raw, extr.raw$tax == extr.raw[5,'tax']);
#' dens.sub = densform(extr.sub, clim = climondbioclim, bw = 'nrd0', n = 128, bg.n=25);
#' densplot(dens.sub, names(climondbioclim[[1]]));
#' }

densform <- function(ex, clim,
                     name = '', bw = "nrd0", kern = 'gaussian',

                     manip = 'condi', n = 1024,
                     from = 0, to = 0, clip = 0,
                     bg.n = 10, bg=NULL){
  #  kern = 'gaussian'
  cut = 0;
  # adjust = (512*60)/n;
  adjust = 1;
  condi = FALSE;
  if(manip == 'condi') {
    condi = TRUE; #print("Conditional Likelihood")
  }

  data = ex;
  if(name == ''){
    name = data[2,'tax'];
  };
  pi = 3.14159265359;
  extr.larr <- data;
  head = which(colnames(ex) %in% 'cells') - 1;
  phytoclim <- clim;
  larr.den <- data.frame();
  larr.den.x <- data.frame();
  larr.den.gauss <- data.frame();
  larr.mean <- data.frame();
  larr.sd <- data.frame();
  larr.w <- data.frame();
  eval <- data.frame();
  bg.eval = data.frame();
  ncoords = length(extr.larr$lon);
  if(condi==TRUE){
    dmatrix = matrix(ncol = ncoords,
                     nrow = ncoords);
    #  print("Getting distance matrix");
    if(is.null(bg)){
      ##for(xx in 1:ncoords){
      ##  for(yy in xx:ncoords){
      # dmatrix[xx,yy] <- cRacle:::.distance(extr.larr$lon[xx], extr.larr$lat[xx],
      #                         extr.larr$lon[yy], extr.larr$lat[yy]);
      ##    dmatrix[xx,yy] <- distance(extr.larr$lon[xx], extr.larr$lat[xx],
      #                                                                extr.larr$lon[yy], extr.larr$lat[yy]);


      ##  }
      ##}

      dmatrix = distance(as.matrix(extr.larr[,c('lon', 'lat')]),
                         as.matrix(extr.larr[,c('lon', 'lat')]));

      #NOTE: The background is selected from a radius around each occurrence
      #record within 5x the mean distance between all points in the sample.
      #This threshold is arbitrary and needs to be empirically tested, but does seem to work.
      bg.rad = 3*max(stats::na.omit(dmatrix));
      #	print(bg.rad)
      #  bg.rad = max(stats::na.omit(dmatrix));
      # print("before rad_bg");
      bg.ex <- rad_bg(cbind(extr.larr$lon, extr.larr$lat),
                      phytoclim,
                      radius = bg.rad,
                      n = bg.n)
    } else {
      bg.ex = bg;
    }
    #	 print("after")
  }


  for(i in 1:length(names(phytoclim))){
    fr <- raster::minValue(phytoclim[[i]]); ### At issue with from and to is reproducibility. If defined globally from the raster then it will always be compatible.
    t <- raster::maxValue(phytoclim[[i]]);
    if(condi == TRUE){
      bg.vec = bg.ex[,names(phytoclim[[i]])];

      bg.den <- stats::density(c(as.numeric(bg.vec), as.numeric(extr.larr[,names(phytoclim[[i]])])),
                               n = n, kernel = kern,
                               adjust = 1, from = fr,
                               to = t, bw = bw,
                               na.rm = TRUE);
      o.vec <- as.numeric(extr.larr[,names(phytoclim[[i]])]);
      weights = 1/.vecprob(o.vec, bg.den$x, bg.den$y)
      weights = weights/sum(stats::na.omit(weights)); #So the weights sum to 1. Could consider other scaling. i.e., by rank order
      # print(weights);
      o.vec <- cbind(o.vec, weights);
      o.vec = stats::na.omit(o.vec);
      # if(sum(o.vec[,2]) != 1){print(names(phytoclim[[i]]));print(o.vec);}

      #For the KDE PDF use the weights option in the stats::density function to estimate the conditional probability
      den <- stats::density(o.vec[,1],
                            n = n, kernel = kern, adjust = 1.2,
                            from = fr,  to = t, weights = o.vec[,2],
                            bw = bw, na.rm = TRUE);
      bg.mean <- mean(as.numeric(bg.ex[,names(phytoclim[[i]])]));
      bg.sd <- stats::sd(bg.ex[,names(phytoclim[[i]])]);
      x = extr.larr[,names(phytoclim[[i]])]

      #For the gaussian PDF estimates use the weighted mean and sd:
      mean <- stats::weighted.mean(as.numeric(o.vec[,1]), as.numeric(o.vec[,2]));
      sd <- sqrt(sum(weights * (x - mean)^2))
      rn <- length(extr.larr[,names(phytoclim[[i]])]);

      if(sd == 0 || is.na(sd) == "TRUE"){
        sd = 0.01;
      };
      for(num in 1:length(den$x)){
        eval[num,1] <- ((1/(sqrt((2*pi)
                                 *(sd^2)))
                         *(2.71828^(-1*((den$x[num] - mean)^2)
                                    /(2*sd^2)))));

      };

    } else {
      den <- stats::density(as.numeric(extr.larr[,names(phytoclim[[i]])]),
                            n = n, kernel = kern,
                            from = fr,  to = t,
                            bw = bw, na.rm = TRUE);
      mean <- mean(extr.larr[,names(phytoclim[[i]])]);
      sd <- stats::sd(extr.larr[,i+head+1]);
      rn <- length(extr.larr[,names(phytoclim[[i]])]);

      if(sd == 0 || is.na(sd) == "TRUE"){
        sd = 0.01;
      };
      for(num in 1:length(den$x)){
        eval[num,1] <- ((1/(sqrt((2*pi)*(sd^2)))*(2.71828^(-1*((den$x[num] - mean)^2)/(2*sd^2)))));
      };
    }

    ##Below is for clipping the PDFs to fit empirical ranges or inferred confidence intervals
    minci = 0;maxci=0;
    if(clip == "95conf"){
      t = stats::qt(0.975, rn-1);
      minci = mean-(t*sd);
      maxci = mean+(t*sd);
    }
    if(clip == "99conf"){

      t = stats::qt(0.995, rn-1);
      minci = mean-(t*sd);
      maxci = mean+(t*sd);
    }
    if(clip == "999conf"){

      t = stats::qt(0.9995, rn-1);
      minci = mean-(t*sd);
      maxci = mean+(t*sd);
    }
    if(clip == 'range'){
      minci = min(extr.larr[,names(phytoclim[[i]])]);
      maxci = max(extr.larr[,names(phytoclim[[i]])]);
    }
    if(minci == 0 & maxci==0){} else{

      if(den$x[[1]] < minci){
        larr.den[1, i] = 0;
        eval[1,1] =0;
      }
      if(den$x[[1]] > maxci){
        larr.den[1, i] = 0;
        eval[1,1] = 0;
      }

      for(zz in 2:length(den$x)){

        if(den$x[[zz]] < minci){
          larr.den[(zz-1), i] = 0;
          eval[(zz-1),1] =0;
        }
        if(den$x[[zz]] > maxci){
          larr.den[(zz-1), i] = 0;
          eval[(zz-1),1] = 0;
        }
      }
    }
    larr.den.x[1:n, i] <- den$x;
    larr.den[1:n, i] <- den$y;

    larr.den.gauss[1:n, i] <- eval[,1];
    larr.mean[1,i] <- mean;
    larr.sd[1,i] <- sd;
    range <- max(larr.den.x[,i]) - min(larr.den.x[,i]);
    #print(sd); print(range);
    w <- sd/range;
    w = 1/w;
    larr.w[1,i] = w;
    #weight = as.numeric(larr.w[1,])
    # weight =  weight - (min(weight));
    # weight = weight/(0.5*max(weight));
    #larr.w[1,] = weight;
  }
  larr.w[1,] = larr.w[1,]/(0.5*max(larr.w[,1]));
  colnames(larr.den.gauss) <- c(paste(names(phytoclim), "gauss", sep = "."));
  colnames(larr.mean) <- c(paste(names(phytoclim), "mean", sep = "."));
  colnames(larr.sd) <- c(paste(names(phytoclim), "sd", sep = "."));
  colnames(larr.w) <- c(paste(names(phytoclim), "w", sep = "."));

  colnames(larr.den) <- c(paste(names(phytoclim), "kde", sep = "."));
  colnames(larr.den.x) <- c(paste(names(phytoclim), "x", sep = "."));
  name = data.frame(name);
  larr.mean = data.frame(larr.mean);
  larr.sd = data.frame(larr.sd);
  colnames(name) <- "name";
  fin <- c(larr.den, larr.den.x, larr.den.gauss, larr.mean, larr.sd, larr.w, name);
  fin <- .makeaucone(fin);
  return(fin);

};

#densform = compiler::cmpfun(densform);

.vecprob <- function(search, x,y){
  to <- max(x);
  from <- min(x);
  num <- length(x);
  by = (to - from)/num;
  bin = floor((search - from) / by)+1;

  if(length(bin)>1){
    for(nn in 1:length(bin)){
      if (is.na(bin[[nn]])) { bin[[nn]] = 1; }
      if (bin[[nn]] <= 1) { bin[[nn]] = 1; }
      if (bin[[nn]] > num) { bin[[nn]] = num; }
    }
  } else{
    if (bin <= 1){bin =1;}
    if (bin > num){
      bin = num
    }
  }
  ret = y[bin]*by;

  # ret[ret == NA] <- min(stats::na.omit(ret));
  ret[ret== -Inf] <- NA;
  ret[is.na(search)] = NA;
  return(ret);
}
rsh249/CRACLE documentation built on Feb. 2, 2022, 11:31 p.m.