R/ensemble.environmentalThin.R

Defines functions `ensemble.environmentalThin.clara` `ensemble.environmentalThin`

`ensemble.environmentalThin` <- function(
    x, predictors.stack=NULL, extracted.data=NULL,
    thin.n=50, runs=100, pca.var=0.95, 
    silent=FALSE, verbose=FALSE,
    return.notRetained=FALSE
) 
{

    .BiodiversityR <- new.env()

# distEnviro.thin operates on stacked distances (do not recalculate distance each run)
    'distEnviro.thin' <- function(x, x2, thin.dist=0.1, thin.n=50) {
# Algorithm modified in MAR-2022 to always reach the minimum distance, but
# in case fewer locations were retained, use a previous step from the removal process      
        x2.c <- x2
        while(min(x2.c[, 3]) < thin.dist && nrow(x2.c) > 1) {
            retained.c <- unique(c(x2.c[, 1], x2.c[, 2]))
            if (length(retained.c) >= thin.n) {x2.prev <- x2.c}
            p <- nrow(x2.c)
            x2.c <- x2.c[sample(p), ]
            first.min <- which(x2.c[, 3] < thin.dist)
            first.min <- first.min[1]
            random.col <- as.numeric(runif(1) > 0.5)+1
            selected <- x2.c[first.min, random.col]
            rows1 <- x2.c[, 1] != selected
            x2.c <- x2.c[rows1, , drop=F]
            rows2 <- x2.c[, 2] != selected
            x2.c <- x2.c[rows2, , drop=F]
        }
        retained.c <- unique(c(x2.c[, 1], x2.c[, 2]))
        if (length(retained.c) >= thin.n) {
            x2 <- x2.c
        }else{
            x2 <- x2.prev
        }
        
        retained <- unique(c(x2[, 1], x2[, 2]))
        x3 <- x[retained, ]
        
        # special case where the remaining 2 locations are closer than minimum distance
        if (nrow(x3)==2 && x2[1, 3] < thin.dist) {
            retained <- sample(retained, size=1)
            x3 <- x[retained, , drop=F]
        }
        # Modified MAR-2022!
        # In case a larger number of locations was retained, use the first algorithm again
        retained <- unique(c(x2[, 1], x2[, 2]))
        retained.n <- length(unique(c(x2[, 1], x2[, 2])))
        
        while(retained.n > thin.n) {
            first.min <- which.min(x2[, 3])
            first.min <- first.min[1]
            random.col <- as.numeric(runif(1) > 0.5)+1
            selected <- x2[first.min, random.col]
            rows1 <- x2[, 1] != selected
            x2 <- x2[rows1, , drop=F]
            rows2 <- x2[, 2] != selected
            x2 <- x2[rows2, , drop=F]
            retained <- unique(c(x2[, 1], x2[, 2]))
            retained.n <- length(retained)
        }
        x3 <- x[retained, ]
        dist.min <- min(x2[, 3])
        return(list(x3=x3, dist.min=dist.min, retained=retained))
    }

    if (verbose == TRUE) {silent <- FALSE}

    if(thin.n >= nrow(x)) {
        if (silent == F) {
            cat(paste("WARNING: thinning parameter larger or equal to number of available locations", "\n"))
            cat(paste("therefore all locations selected", "\n\n"))
        }
        if (return.notRetained == TRUE) {
            return(list(retained=x, not.retained=NULL))
        }else{
            return(x)
        }
    }        

# create background data
    if (is.null(extracted.data) == TRUE) {
        background.data <- raster::extract(predictors.stack, x)
        background.data <- data.frame(background.data)
    }else{
        if (nrow(x) != nrow(extracted.data)) {stop("WARNING: different row numbers of coordinates and extracted.data")}
        background.data <- extracted.data
    }
        
    TrainValid <- complete.cases(background.data)
    x <- x[TrainValid,]
    background.data <- background.data[TrainValid,]

# PCA of scaled variables
    rda.result <- vegan::rda(X=background.data, scale=TRUE)
# select number of axes
    ax <- 1
    while ( (sum(vegan::eigenvals(rda.result)[c(1:ax)])/sum(vegan::eigenvals(rda.result))) < pca.var ) {ax <- ax+1}
    if (silent == FALSE) {cat(paste("\n", "Percentage of variance of the selected axes (1 to ", ax, ") of principal components analysis: ", 100*sum(vegan::eigenvals(rda.result)[c(1:ax)])/sum(vegan::eigenvals(rda.result)), "\n", sep = ""))}
    rda.scores <- vegan::scores(rda.result, display="sites", scaling=1, choices=c(1:ax))
    rda.dist <- as.matrix(vegdist(rda.scores, method="euc"))
    rda.dist <- signif(rda.dist, digits=6)
#
# stack
    n <- nrow(x)
    pairs <- utils::combn(n, 2)
    p <- ncol(pairs)
    pairs <- cbind(t(pairs), numeric(p))
    for (i in 1:p) {
        pairs[i, 3] <- rda.dist[pairs[i, 1], pairs[i, 2]]
    }
#
    runs <- max(runs, 1)
    dist.all <- 0
#
# algorithm 1 removes from pairs with smallest distance
    x2 <- pairs
    retained.n <- length(unique(c(x2[, 1], x2[, 2])))
    while(retained.n > thin.n) {
        first.min <- which.min(x2[, 3])
        first.min <- first.min[1]
        random.col <- as.numeric(runif(1) > 0.5)+1
        selected <- x2[first.min, random.col]
        rows1 <- x2[, 1] != selected
        x2 <- x2[rows1, , drop=F]
        rows2 <- x2[, 2] != selected
        x2 <- x2[rows2, , drop=F]
        retained <- unique(c(x2[, 1], x2[, 2]))
        retained.n <- length(retained)
    }
    x3 <- x[retained, ]
    dist.min1 <- min(x2[, 3])
    if (silent == FALSE) {
        cat(paste("Environmentally thinned point location data set obtained with first algorithm", "\n", sep=""))
        cat(paste("number of locations: ", nrow(x3), "\n"))
        cat(paste("minimum distance: ", dist.min1, "\n"))
    }
#
# algorithm 2 uses minimum distance of previous algorithm
# now algorithm attempts to maximize the number of retained locations, similar to ensemble.spatialThin
#
    if (silent == F) {cat(paste("\n", "Environmentally thinned point location data set obtained with second algorithm", "\n", sep=""))}
    dist.all <- 0
    dist.n.all <- 0
    for (i in 1:runs) {
        dist1 <- distEnviro.thin(x, x2=pairs, thin.dist=dist.min1, thin.n=thin.n)
        dist.min2 <- dist1$dist.min
        dist.n <- nrow(dist1$x3)
        if (verbose == T) {
            if (dist.min2 > dist.all) {cat(paste("run ", i, " (", dist.n, " locations with minimum distance: ", dist.min2, " > ", dist.all, " [previous minimum distance])", "\n", sep=""))}
            if (dist.min2 == dist.all) {cat(paste("run ", i, " (", dist.n, " locations with minimum distance: ", dist.min2, " = ", dist.all, " [previous minimum distance])", "\n", sep=""))}
            if (dist.min2 < dist.all) {cat(paste("run ", i, " (", dist.n, " locations with minimum distance: ", dist.min2, ")", "\n", sep=""))}
        }
        if (dist.min2 > dist.all) { 
            dist.all <- dist.min2
            loc.out <- dist1$x3
            dist.n.all <- dist.n
            retained <- dist1$retained
        }
        if (dist.min2 == dist.all && dist.n > dist.n.all) { 
            dist.all <- dist.min2
            loc.out <- dist1$x3
            dist.n.all <- dist.n
            retained <- dist1$retained
        }
    }
    if (verbose == TRUE) {cat(paste("\n"))}
    if (silent == FALSE) {
        cat(paste("number of locations: ", nrow(loc.out), "\n"))
        cat(paste("minimum distance: ", dist.all, "\n"))
    }
    if (return.notRetained == TRUE) {
        x.not <- x[(c(1:nrow(x)) %in% retained) == FALSE, ]
        return(list(retained=loc.out, not.retained=x.not))
    }else{
        return(loc.out)
    }
}


`ensemble.environmentalThin.clara` <- function(
    x, predictors.stack=NULL, thin.n=20, runs=100, pca.var=0.95, 
    silent=FALSE, verbose=FALSE,
    clara.k=100
) 
{
#
# create background data
    background.data <- raster::extract(predictors.stack, x)
    background.data <- data.frame(background.data)
    TrainValid <- complete.cases(background.data)
    x <- x[TrainValid,]
    background.data <- background.data[TrainValid,]

# PCA of scaled variables
    rda.result <- vegan::rda(X=background.data, scale=T)
# select number of axes
    ax <- 1
    while ( (sum(vegan::eigenvals(rda.result)[c(1:ax)])/sum(vegan::eigenvals(rda.result))) < pca.var ) {ax <- ax+1}
    if (silent == F) {cat(paste("\n", "Percentage of variance of the selected axes (1 to ", ax, ") of principal components analysis: ", 100*sum(vegan::eigenvals(rda.result)[c(1:ax)])/sum(vegan::eigenvals(rda.result)), "\n", sep = ""))}
    rda.scores <- vegan::scores(rda.result, display="sites", scaling=1, choices=c(1:ax))
#
# cluster and thin if more locations in each cluster
    clara.result <- cluster::clara(rda.scores, k=clara.k, metric="euclidean", medoids.x=F)$clustering
#
    loc.first <- TRUE
    for (lo in 1:clara.k) {
        x.bin <- x[clara.result == lo,  , drop=F]
        x.env <- x.bin
        if (nrow(as.data.frame(x.bin)) > thin.n) {
            x.env <- ensemble.environmentalThin(x=x.bin, predictors.stack=predictors.stack, thin.n=thin.n, runs=runs, pca.var=pca.var, verbose=verbose, silent=silent)
        }
        if (loc.first == T) {
            x.out <- x.env
            loc.first <- FALSE
        }else{
            x.out <- rbind(x.out, x.env)
        }
    }
    return(x.out)
}

Try the BiodiversityR package in your browser

Any scripts or data that you put into this service are public.

BiodiversityR documentation built on Oct. 22, 2023, 5:06 p.m.