R/combinedGradientForest.r

Defines functions `combinedGradientForest`

`combinedGradientForest` <-
function(..., nbin=101, method=2, standardize=c("before","after")[1])
{
    std.options <- c("before","after")
    if (is.na(std.option <- pmatch(standardize,std.options)))
      stop(paste('Unmatched standardize value "',standardize,'". Expecting "before" or "after"',sep=""))

    fList <- list(...)
    ngear <- length(fList)
    if(!all(sapply(fList,inherits,"gradientForest")))
      stop("Every argument must be a gradientForest")
#
#   assign forest names
    if(is.null(gearnames <- names(fList)))
      gearnames <- paste("F",1:ngear,sep="")
    if (any(empty <- gearnames==""))
      gearnames[empty] <- paste("F",1:ngear,sep="")[empty]
    names(fList) <- gearnames
#
#   check the predictor names are the same
    npred <- sapply(fList, function(obj) ncol(obj$X))
    nsite <- sapply(fList, function(obj) nrow(obj$X))
#    if(!all(npred == npred[1]))
#      stop("Every forest must have the same number of predictors")
    prednames  <- lapply(fList, function(obj) levels(obj$res$var))
    allpreds <- as.character(unique(sort(unlist(prednames))))
#   find gradientForest objects that support each predictor
    gf.names <- lapply(namenames(allpreds), function(predictor) gearnames[sapply(prednames, is.element, el=predictor)])
#    if(!all(prednames == prednames[,1]))
#      stop("Every forest must have the same predictors")
#    prednames <- prednames[,1]
#    npred <- npred[1]
#
#   combined predictor matrix and common bins for importance curve grid
    create.df.aux <- function(X) as.data.frame(do.call("cbind",lapply(namenames(allpreds), function(pred) {res <- X[[pred]]; if(is.null(res)) rep(0,nrow(X)) else res})))
    create.df <- function(X,transpose=F) {
      if (transpose) X <- as.data.frame(t(X))
      X <- create.df.aux(X)
      if (transpose) X <- as.data.frame(t(X))
      X
    }

####Need to create X_r. Need gf.name, value, predictor, nspec.
####nspec can wait
    ####value and predictor come from gf object X, but we only want predictors used by the gf obj. which is unique(obj$res$var)
    ####Need to do it for each gf, then rbind
    gf.x.long.form <- function(gearname, gf) {
      preds <- as.character(unique(gf$res$var))
      out <- data.frame(gf.name = gearname, stack(gf$X[preds]))
      names(out) <- c("gf.name","value","predictor")
      return(out)
    }
    gf.x.long.form.missed <- function(gearname, gf, preds) {
      out <- data.frame(gf.name = gearname, stack(gf$X[preds]))
      names(out) <- c("gf.name","value","predictor")
      return(out)
    }

    X_r <- do.call("rbind", lapply(gearnames, function(gearname) {gf.x.long.form(gearname, gf = fList[[gearname]])}))
    ## preds that are not used by any GF model will
    ## have a cumimp of 0.
    ## however, a number of processing steps expect to
    ## use the predictor range, so the simplest solution
    ## is to add the predictor back in.
    missed_preds <- setdiff(allpreds, levels(X_r$predictor))
    if(length(missed_preds) > 0) {
        rescued_preds <- do.call("rbind", lapply(gearnames, function(gearname) {gf.x.long.form.missed(gearname, gf = fList[[gearname]], preds = missed_preds)}))
        X_r <- rbind(X_r, rescued_preds)
    }    
    nspec <- sapply(fList,"[[","species.pos.rsq")
    X_r$nspec <- nspec[X_r$gf.name]
    X_r <- na.omit(X_r)

    ##The purpose of bins is to make sure the range of
    ##each predictor covers the range of samples that
    ##were meaningful. If a model was fitted with high
    ## values of a predictor, but didn't use the predictor,
    ## then we want to focus on the narrower range of
    ## values that other models did use.
    ## If NO model uses a predictor, then the range we
    ## use is not as important, all models will assign
    ## a cumulative importance of 0.
    
    bins <- tapply(1:nrow(X_r), X_r$predictor, function(sub){
      bin(X_r$value[sub], nbin=nbin)
    })
    bins <- do.call("cbind", bins)


    imp.rsq.long <- do.call("rbind", lapply(gearnames, function(gf, fList) {
      resp.names <- colnames(fList[[gf]]$imp.rsq)
      imp.rsq.tmp <- data.frame(predictor = row.names(fList[[gf]]$imp.rsq), fList[[gf]]$imp.rsq)
      colnames(imp.rsq.tmp)[-1] <- resp.names
      out <- stats::reshape(imp.rsq.tmp,
        idvar = "predictor",
        varying = resp.names, v.names = "imp",
        times = resp.names, direction = "long",
        timevar = "resp"
      )
      out$resp <- sub("^", paste0(gf, "."), out$resp)
      return(out)
    }, fList = fList))
    imp.rsq <- stats::reshape(imp.rsq.long,
                              v.names = "imp",
                              timevar = "resp",
                              idvar = "predictor",
direction = "wide"
                              )
    imp.rsq[is.na(imp.rsq)] <- 0
    names(imp.rsq) <- sub("^imp.", "", names(imp.rsq))
    row.names(imp.rsq) <- imp.rsq$predictor
    imp.rsq.total <- do.call("cbind", lapply(gearnames, function(gn){
      gear.cols <- grep(pattern = paste0("^", gn, "."), names(imp.rsq))
      out <- data.frame(x = rowSums(imp.rsq[gear.cols]))
      names(out) <- gn
      return(out)
    }))
    names(imp.rsq.total) <- gearnames

    ####imp.rsq is now working, any combination of used preditors is allowed, but row and column names are broken.
    ####combinations of predictor and species that have not been seen have an R^2 importance of 0

#### To switch to allowing different input predictors:
    ####Generate X_r more directly,
####Get bins from X_r, using tapply (base R for operating on groups/subsets)
    ####imp.rsq? Absent pred-species combinations to 0?

    ###Allow different input predictors
    ####gf.names gives mapping between GF object and predictor set
    ####X is making a giant data.frame of all samples, labelled by GF
    X <- lapply(gearnames, function(a) {cbind(gf.name=a,create.df(fList[[a]]$X))})
    names(X) <- gearnames
    ## X <- do.call("rbind",lapply(gearnames, function(a) {cbind(gf.name=a,create.df(fList[[a]]$X))})) #
#    X <- do.call("rbind",lapply(gearnames, function(a) {cbind(gf.name=a,fList[[a]]$X)}))
#
    ####Bins is using each pred separately, and creating nbin equally spaced bins using the max and min
    ## bins <- do.call("cbind",lapply(X[allpreds], function(x) bin(x,nbin=nbin)))
    ####Importance, of each predictor for each species, pred by sp.
    ## imp.rsq.list <- lapply(fList, function(x) create.df(x$imp.rsq,transpose=T))
    ## ####cbind, pred by sp for all sp
    ## imp.rsq <- do.call("cbind",imp.rsq.list)
    ####Overall R^2 for each species across all surveys
    rsq <- unlist(lapply(fList, function(x) x$result))
#
#   combined density calculation
    ####X_r is long form of all predictors, with survey as a label column
    ## X_r <- cbind(X[,1], stack(X[allpreds]))
    ## names(X_r) <- c("gf.name","value","predictor")
    ## ####Species per survey, as a vector
    ## nspec <- sapply(fList,"[[","species.pos.rsq")
    ## X_r$nspec <- nspec[X_r$gf.name]
    ## X_r <- na.omit(X_r)
    ####dens gives density for each predictor in combined GF, using long form
    dens <- with(X_r,tapply(1:nrow(X_r),predictor,function(sub) {
      whiten(density(value[sub],weight=nspec[sub]/sum(nspec[sub])),lambda=0.95)
    }))
    dens <- c(list(Combined=dens),lapply(fList, function(x) x$dens))
#
#   Gather the overall cumulative importances from each gradientForest
#   Combine cumulative importances
#   Normalize relative to combined importance
#
    gridded.cumulative.importance <- function(obj, predictor) {
      cu <- cumimp(obj, predictor=predictor, standardize_after=(std.options[std.option]=="after"))
      grid <- bins[,predictor]
      if (length(cu$x)==1)
        y <- approx(cu$x,cu$y,grid,rule=2,method="constant",yleft=0)$y
      else y <- approx(cu$x,cu$y,grid,rule=2,method="linear")$y
      list(x=grid, y=y)
    }
#
#   Linear interpolation to grid
#   Density outside survey is zero
#
    interpolate <- function(xy, grid) {
      res <- approx(xy$x,xy$y,grid,rule=1,method="linear")$y
      res[is.na(res)] <- 0
      res
    }

    CU <- lapply(namenames(allpreds), function(predictor)
      lapply(fList[gf.names[[predictor]]], gridded.cumulative.importance, predictor=predictor))
    rsq.total <- sapply(lapply(fList,"[[","result"),sum)
    ## imp.rsq.total <- sapply(imp.rsq.list,rowSums,na.rm=TRUE)
    for (predictor in allpreds) {
      g <- gf.names[[predictor]]
      weight <- as.matrix(rbind(
        uniform = rep(1,length(g)), 
        species = nspec[g], 
        rsq.total = imp.rsq.total[predictor,g],
        rsq.mean = imp.rsq.total[predictor,g]/nspec[g],
        site = nsite[g], 
        site.species = nsite[g]*nspec[g], 
        site.rsq.total = nsite[g]*imp.rsq.total[predictor,g],
        site.rsq.mean = nsite[g]*imp.rsq.total[predictor,g]/nspec[g]
      ))
      densList <- lapply(dens[c("Combined",g)],"[[",predictor) # list of densities, combined version first
      grid <- bins[,predictor]
      densMat <- sapply(densList, interpolate, grid=grid)
      CU[[predictor]][["density"]] <- list(x=grid,y=densMat)
      if (method==2) {
        CUmat <- combine.cumulative.importance(CU[[predictor]][g], densMat, grid, weight)
      } else if (method==1) {
        imp <- rowMeans(imp.rsq)[predictor]
        CUmat <- combine.cumulative.importance.method1(CU[[predictor]][g], densMat, grid, weight, imp)
      } else stop(paste("Unknown method:",method))
      for (i in rownames(weight))
        CU[[predictor]][[paste("combined",i,sep=".")]] <- list(x=grid,y=CUmat[,i])
    }

    out <- list(
      call = match.call(),
      X = X, #TODO: X is now a list of site by pred matrices, one per GF object. Future functions expecting X to be a global site by pred matrix will fail. I expect predict to fail if no new env is provided
      dens = dens,
      imp.rsq = imp.rsq,
      rsq = rsq,
      nspec = nspec,
      CU = CU,
      gf.names = gf.names
      )
    class(out) <- c("combinedGradientForest","list")
    out
  }

Try the gradientForest package in your browser

Any scripts or data that you put into this service are public.

gradientForest documentation built on Aug. 24, 2023, 3:03 p.m.