R/combinedGradientForest.r
In gradientForest: Random Forest functions for the Census of Marine Life synthesis project.

`combinedGradientForest` <-
function(..., nbin=101, method=2, standardize=c("before","after")[1])
{
    std.options <- c("before","after")
    if (is.na(std.option <- pmatch(standardize,std.options)))
      stop(paste('Unmatched standardize value "',standardize,'". Expecting "before" or "after"',sep=""))

    fList <- list(...)
    ngear <- length(fList)
    if(!all(sapply(fList,inherits,"gradientForest")))
      stop("Every argument must be a gradientForest")
#
#   assign forest names
    if(is.null(gearnames <- names(fList)))
      gearnames <- paste("F",1:ngear,sep="")
    if (any(empty <- gearnames==""))
      gearnames[empty] <- paste("F",1:ngear,sep="")[empty]
    names(fList) <- gearnames
#
#   check the predictor names are the same
    npred <- sapply(fList, function(obj) ncol(obj$X))
    nsite <- sapply(fList, function(obj) nrow(obj$X))
#    if(!all(npred == npred[1]))
#      stop("Every forest must have the same number of predictors")
    prednames  <- lapply(fList, function(obj) levels(obj$res$var))
    allpreds <- as.character(unique(sort(unlist(prednames))))
#   find gradientForest objects that support each predictor
    gf.names <- lapply(namenames(allpreds), function(predictor) gearnames[sapply(prednames, is.element, el=predictor)])
#    if(!all(prednames == prednames[,1]))
#      stop("Every forest must have the same predictors")
#    prednames <- prednames[,1]
#    npred <- npred[1]
#
#   combined predictor matrix and common bins for importance curve grid
    create.df.aux <- function(X) as.data.frame(do.call("cbind",lapply(namenames(allpreds), function(pred) {res <- X[[pred]]; if(is.null(res)) rep(0,nrow(X)) else res})))
    create.df <- function(X,transpose=F) {
      if (transpose) X <- as.data.frame(t(X))
      X <- create.df.aux(X)
      if (transpose) X <- as.data.frame(t(X))
      X
    }

####Need to create X_r. Need gf.name, value, predictor, nspec.
####nspec can wait
    ####value and predictor come from gf object X, but we only want predictors used by the gf obj. which is unique(obj$res$var)
    ####Need to do it for each gf, then rbind
    gf.x.long.form <- function(gearname, gf) {
      preds <- as.character(unique(gf$res$var))
      out <- data.frame(gf.name = gearname, stack(gf$X[preds]))
      names(out) <- c("gf.name","value","predictor")
      return(out)
    }
    gf.x.long.form.missed <- function(gearname, gf, preds) {
      out <- data.frame(gf.name = gearname, stack(gf$X[preds]))
      names(out) <- c("gf.name","value","predictor")
      return(out)
    }

    X_r <- do.call("rbind", lapply(gearnames, function(gearname) {gf.x.long.form(gearname, gf = fList[[gearname]])}))
    ## preds that are not used by any GF model will
    ## have a cumimp of 0.
    ## however, a number of processing steps expect to
    ## use the predictor range, so the simplest solution
    ## is to add the predictor back in.
    missed_preds <- setdiff(allpreds, levels(X_r$predictor))
    if(length(missed_preds) > 0) {
        rescued_preds <- do.call("rbind", lapply(gearnames, function(gearname) {gf.x.long.form.missed(gearname, gf = fList[[gearname]], preds = missed_preds)}))
        X_r <- rbind(X_r, rescued_preds)
    }    
    nspec <- sapply(fList,"[[","species.pos.rsq")
    X_r$nspec <- nspec[X_r$gf.name]
    X_r <- na.omit(X_r)

    ##The purpose of bins is to make sure the range of
    ##each predictor covers the range of samples that
    ##were meaningful. If a model was fitted with high
    ## values of a predictor, but didn't use the predictor,
    ## then we want to focus on the narrower range of
    ## values that other models did use.
    ## If NO model uses a predictor, then the range we
    ## use is not as important, all models will assign
    ## a cumulative importance of 0.
    
    bins <- tapply(1:nrow(X_r), X_r$predictor, function(sub){
      bin(X_r$value[sub], nbin=nbin)
    })
    bins <- do.call("cbind", bins)


    imp.rsq.long <- do.call("rbind", lapply(gearnames, function(gf, fList) {
      resp.names <- colnames(fList[[gf]]$imp.rsq)
      imp.rsq.tmp <- data.frame(predictor = row.names(fList[[gf]]$imp.rsq), fList[[gf]]$imp.rsq)
      colnames(imp.rsq.tmp)[-1] <- resp.names
      out <- stats::reshape(imp.rsq.tmp,
        idvar = "predictor",
        varying = resp.names, v.names = "imp",
        times = resp.names, direction = "long",
        timevar = "resp"
      )
      out$resp <- sub("^", paste0(gf, "."), out$resp)
      return(out)
    }, fList = fList))
    imp.rsq <- stats::reshape(imp.rsq.long,
                              v.names = "imp",
                              timevar = "resp",
                              idvar = "predictor",
direction = "wide"
                              )
    imp.rsq[is.na(imp.rsq)] <- 0
    names(imp.rsq) <- sub("^imp.", "", names(imp.rsq))
    row.names(imp.rsq) <- imp.rsq$predictor
    imp.rsq.total <- do.call("cbind", lapply(gearnames, function(gn){
      gear.cols <- grep(pattern = paste0("^", gn, "."), names(imp.rsq))
      out <- data.frame(x = rowSums(imp.rsq[gear.cols]))
      names(out) <- gn
      return(out)
    }))
    names(imp.rsq.total) <- gearnames

    ####imp.rsq is now working, any combination of used preditors is allowed, but row and column names are broken.
    ####combinations of predictor and species that have not been seen have an R^2 importance of 0

#### To switch to allowing different input predictors:
    ####Generate X_r more directly,
####Get bins from X_r, using tapply (base R for operating on groups/subsets)
    ####imp.rsq? Absent pred-species combinations to 0?

    ###Allow different input predictors
    ####gf.names gives mapping between GF object and predictor set
    ####X is making a giant data.frame of all samples, labelled by GF
    X <- lapply(gearnames, function(a) {cbind(gf.name=a,create.df(fList[[a]]$X))})
    names(X) <- gearnames
    ## X <- do.call("rbind",lapply(gearnames, function(a) {cbind(gf.name=a,create.df(fList[[a]]$X))})) #
#    X <- do.call("rbind",lapply(gearnames, function(a) {cbind(gf.name=a,fList[[a]]$X)}))
#
    ####Bins is using each pred separately, and creating nbin equally spaced bins using the max and min
    ## bins <- do.call("cbind",lapply(X[allpreds], function(x) bin(x,nbin=nbin)))
    ####Importance, of each predictor for each species, pred by sp.
    ## imp.rsq.list <- lapply(fList, function(x) create.df(x$imp.rsq,transpose=T))
    ## ####cbind, pred by sp for all sp
    ## imp.rsq <- do.call("cbind",imp.rsq.list)
    ####Overall R^2 for each species across all surveys
    rsq <- unlist(lapply(fList, function(x) x$result))
#
#   combined density calculation
    ####X_r is long form of all predictors, with survey as a label column
    ## X_r <- cbind(X[,1], stack(X[allpreds]))
    ## names(X_r) <- c("gf.name","value","predictor")
    ## ####Species per survey, as a vector
    ## nspec <- sapply(fList,"[[","species.pos.rsq")
    ## X_r$nspec <- nspec[X_r$gf.name]
    ## X_r <- na.omit(X_r)
    ####dens gives density for each predictor in combined GF, using long form
    dens <- with(X_r,tapply(1:nrow(X_r),predictor,function(sub) {
      whiten(density(value[sub],weight=nspec[sub]/sum(nspec[sub])),lambda=0.95)
    }))
    dens <- c(list(Combined=dens),lapply(fList, function(x) x$dens))
#
#   Gather the overall cumulative importances from each gradientForest
#   Combine cumulative importances
#   Normalize relative to combined importance
#
    gridded.cumulative.importance <- function(obj, predictor) {
      cu <- cumimp(obj, predictor=predictor, standardize_after=(std.options[std.option]=="after"))
      grid <- bins[,predictor]
      if (length(cu$x)==1)
        y <- approx(cu$x,cu$y,grid,rule=2,method="constant",yleft=0)$y
      else y <- approx(cu$x,cu$y,grid,rule=2,method="linear")$y
      list(x=grid, y=y)
    }
#
#   Linear interpolation to grid
#   Density outside survey is zero
#
    interpolate <- function(xy, grid) {
      res <- approx(xy$x,xy$y,grid,rule=1,method="linear")$y
      res[is.na(res)] <- 0
      res
    }

    CU <- lapply(namenames(allpreds), function(predictor)
      lapply(fList[gf.names[[predictor]]], gridded.cumulative.importance, predictor=predictor))
    rsq.total <- sapply(lapply(fList,"[[","result"),sum)
    ## imp.rsq.total <- sapply(imp.rsq.list,rowSums,na.rm=TRUE)
    for (predictor in allpreds) {
      g <- gf.names[[predictor]]
      weight <- as.matrix(rbind(
        uniform = rep(1,length(g)), 
        species = nspec[g], 
        rsq.total = imp.rsq.total[predictor,g],
        rsq.mean = imp.rsq.total[predictor,g]/nspec[g],
        site = nsite[g], 
        site.species = nsite[g]*nspec[g], 
        site.rsq.total = nsite[g]*imp.rsq.total[predictor,g],
        site.rsq.mean = nsite[g]*imp.rsq.total[predictor,g]/nspec[g]
      ))
      densList <- lapply(dens[c("Combined",g)],"[[",predictor) # list of densities, combined version first
      grid <- bins[,predictor]
      densMat <- sapply(densList, interpolate, grid=grid)
      CU[[predictor]][["density"]] <- list(x=grid,y=densMat)
      if (method==2) {
        CUmat <- combine.cumulative.importance(CU[[predictor]][g], densMat, grid, weight)
      } else if (method==1) {
        imp <- rowMeans(imp.rsq)[predictor]
        CUmat <- combine.cumulative.importance.method1(CU[[predictor]][g], densMat, grid, weight, imp)
      } else stop(paste("Unknown method:",method))
      for (i in rownames(weight))
        CU[[predictor]][[paste("combined",i,sep=".")]] <- list(x=grid,y=CUmat[,i])
    }

    out <- list(
      call = match.call(),
      X = X, #TODO: X is now a list of site by pred matrices, one per GF object. Future functions expecting X to be a global site by pred matrix will fail. I expect predict to fail if no new env is provided
      dens = dens,
      imp.rsq = imp.rsq,
      rsq = rsq,
      nspec = nspec,
      CU = CU,
      gf.names = gf.names
      )
    class(out) <- c("combinedGradientForest","list")
    out
  }

Any scripts or data that you put into this service are public.

gradientForest documentation built on Aug. 24, 2023, 3:03 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

gradientForest
Random Forest functions for the Census of Marine Life synthesis project.

R/combinedGradientForest.r
In gradientForest: Random Forest functions for the Census of Marine Life synthesis project.

Defines functions `combinedGradientForest`

Try the gradientForest package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gradientForest Random Forest functions for the Census of Marine Life synthesis project.

R/combinedGradientForest.r In gradientForest: Random Forest functions for the Census of Marine Life synthesis project.

Defines functions `combinedGradientForest`

Try the gradientForest package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

gradientForest
Random Forest functions for the Census of Marine Life synthesis project.

R/combinedGradientForest.r
In gradientForest: Random Forest functions for the Census of Marine Life synthesis project.