#' Prepare dataset to be included in the model selection procedure
#'
#' This specifies data, covariates and allows resampling dataset in a regular grid for
#' autocorrelation purposes
#'
#' @param x data.frame or SpatialPointsDataFrame of observations with covariates
#' @param coords name or number of column with coordinates x,y.
#' Or a 2-column matrix of x-y coordinates. If missing, output object will be
#' a simple data.frame except if x is SpatialPointsDataFrame.
#' @param proj4string projection of the dataset. Default to x projection if exist or
#' '+proj=longlat +datum=WGS84'.
#' @param var column name or number of variable to be predicted
#' @param cov column name or numbers to be used as covariates
#' @param RefRaster raster as in library(raster). Raster used as smallest grid for data
#' gridding procedure. Resampling dataset into regular grid to decrease
#' spatial-autocorrelation. See details.
#' @param datatype string. Choose among "Cont" (continuous data, even if only positive),
#' "PA" (presence-absence data) or "Count" (count data). Required when RefRaster is
#' not NULL. dataY average for regular grid resampling are rounded for count models.
#' @param na.rm logical. If TRUE, removes complete dataset rows with NA values.
#' For further analysis, in particular with cross-validation procedure,
#' NA values in the dataset is a big problem. This will bias cross-validation
#' indices as they won't be calculated on the same amount of data
#' depending on covariates with NA in the model tested. Thus, although this is
#' drastic, rows of data with NA values are removed from the dataset with a warning.
#'
#' @details RefRaster recommendation : If there is spatial autocorrelation in the data,
#' use the higher resolution covariate raster as a reference or use \code{\link{spatialcor_dist}} to
#' determine smallest resolution to choose. If sampling plan is supposed not unbalanced
#' regarding covariates distribution, set RefRaster to NULL. Set to NULL if distribution
#' tested in the models will be KrigeGLM or KrigeGLM.dist (see \code{\link{AIC_indices}} or
#' \code{\link{crossV_indices}}).
#'
#'
#' @return return a dataset where variable of interest is called 'dataY'
#' (for compatibility with other functions. To be changed later to allow user defined name)
#' and where only covariates of interest are kept. 'factor_' is added to covariates
#' names that are factors (for compatibility with other functions).
#' @export
Prepare_dataset <- function(x, var = 1, cov = 2:ncol(x), coords,
proj4string = NULL, RefRaster = NULL,
datatype = "Cont", na.rm = TRUE
) {
# Remove space in names
names(x) <- make.names(names(x))
if (length(grep("_P[[:digit:]]*$", names(x))) > 0) {
names(x)[grep("_P[[:digit:]]*$", names(x))] <- paste0(names(x), "x")
message(
c("For compatibility reasons, names of covariates cannot finish by ",
"'_P[:digit:]*'. Thus 'x' was added at the end of the problematic names"))
}
# Get information from a SpatialPointsDataFrame
if (is(x)[1] == "SpatialPointsDataFrame") {
proj4string <- sp::proj4string(x)
coords <- x@coords #coordinates(x)
test.na.x <- x@data
} else if (grepl("MULTI", is(x)[1])) {
stop("Spatial Object of library sf are not yet implemented,
please provide a SpatialPointsDataFrame")
} else {
test.na.x <- x
if (!missing(coords)) {
if (is.vector(coords) & length(coords) == 2) {
coords <- data.frame(x)[,coords]
} else {
coords <- data.frame(coords)
}
}
}
# Test for NA values in covariates and remove rows
if (sum(is.na(test.na.x[,cov])) != 0) {
w.na <- unique(row(test.na.x[,cov])[which(is.na(test.na.x[,cov]))])
w.na <- w.na[order(w.na)]
if (na.rm) {
message(c("Your dataset contained covariates with NA values. ",
length(w.na), " row(s) deleted for future cross-validation modelling"))
if (!missing(coords)) {
coords <- coords[-w.na,]
}
x <- x[-w.na,]
} else {
warning(c("Your dataset contains covariates with NA values : ", length(w.na), " row(s). ",
"If you continue the analysis with the cross-validation procedure,",
"you should set na.rm = TRUE in this function. See help."))
}
}
# Verify coordinates system when spatial data or coords supplied
if (is.null(proj4string) & !missing(coords)) {
warning(c("proj4string is not specified. Non projected system is supposed:",
" +proj=longlat +datum=WGS84"))
proj4string <- "+proj=longlat +datum=WGS84"
}
# If output is not spatial data
if (missing(coords)) {
message("coords empty, output dataset will not be a SpatialPointsDataFrame")
Y_data_sp <- as.data.frame(cbind(x[,var], x[,cov]))
} else {
# If output is spatial data
Y_data_sp <- sp::SpatialPointsDataFrame(coords = coords,
data = as.data.frame(x)[,c(var, cov)],
proj4string = sp::CRS(proj4string))
}
# First column is the y variable to be fitted
names(Y_data_sp)[1] <- "dataY"
if (grepl("PA", datatype) & any(!Y_data_sp$dataY %in% c(0,1))) {
# Transform data as 0/1
if (is.character(Y_data_sp$dataY) | is.factor(Y_data_sp$dataY)) {
if (length(unique(Y_data_sp$dataY)) != 2) {
stop("Only two levels are possible for presence-absence models")
}
levels <- levels(as.factor(Y_data_sp$dataY))
message(levels[1], " was changed as 0, and ",
levels[2], " was changed as 1")
Y_data_sp$dataY <- as.numeric(as.factor(Y_data_sp$dataY)) - 1
} else if (is.numeric(Y_data_sp$dataY)) {
message("dataY was not 0/1, all numeric values above 0 were transformed as 1 for Presence-absence models")
Y_data_sp$dataY <- 1 * (Y_data_sp$dataY != 0)
} else {
stop("Values in dataY were not numeric, character or factor with two levels.")
}
} else if (any(!is.numeric(Y_data_sp$dataY))) {
stop("Values in dataY should be numeric for non PA models")
}
# List names of covariates
namesCovariates <- names(Y_data_sp)[-which(names(Y_data_sp) == "dataY")]
# All covariates that are factor are renamed 'factor' for further script
# simplifications
# IsData <- apply(t(namesCovariates), 2, function(x) is(Y_data_sp@data[, x])[1])
if (missing(coords)) {
IsNum <- unlist(lapply(Y_data_sp[,namesCovariates], is.numeric))
} else {
IsNum <- unlist(lapply(Y_data_sp@data[,namesCovariates], is.numeric))
}
if (length(which(!IsNum &
!grepl("factor_", namesCovariates))) > 0) {
namesCovariates[which(!IsNum &
!grepl("factor_", namesCovariates))] <-
paste0("factor_", namesCovariates[which(
!IsNum & !grepl("factor_", namesCovariates))])
}
names(Y_data_sp)[-which(names(Y_data_sp) == "dataY")] <- namesCovariates
# Resample the dataset using a raster grid
if (is.null(RefRaster)) {
Y_data_sample <- Y_data_sp
} else {
if (missing(coords)) {
stop(c("Cannot resample the dataset into grid if coords is not specified",
" or x is not SpatialPointsDataFrame"))
}
if (proj4string != sp::proj4string(RefRaster)) {
warning(c("Dataset projection may be different than RefRaster one. ",
"Resampling in the grid may not work"))
}
raster::values(RefRaster) <- 1:raster::ncell(RefRaster)
# In which cells are samples
Y_data_Grid <- raster::extract(RefRaster, Y_data_sp)
# Mean position
coordX_tmp <- tapply(sp::coordinates(Y_data_sp)[, 1], Y_data_Grid, mean)
coordY_tmp <- tapply(sp::coordinates(Y_data_sp)[, 2], Y_data_Grid, mean)
if (grep("PA", datatype)) {
# Presence if at least one presence in cell
dataY <- tapply(Y_data_sp$dataY, Y_data_Grid, max)
modelselect_opt$datatype <- "PA"
} else {
# Mean of y variable in each cell
dataY <- tapply(Y_data_sp$dataY, Y_data_Grid, mean)
if (grep("Count", datatype)) {
Y_data_sample$dataY <- round(Y_data_sample$dataY)
message(c("Variable has been rounded to be used as count data ",
"with the regular grid resampling procedure"))
if (!grepl("Count", modelselect_opt$datatype)) {
warning(c("You should also choose one of the Count datatype for ",
"modelselect_opt$datatype"))
}
}
}
# Mean of covariates in each cell
phys_tmp <- lapply(t(namesCovariates), function(j) {
if (!grepl("factor_", j)) {
phys_tmp_a <- tapply(
as.numeric(as.character(Y_data_sp@data[,which(names(Y_data_sp) == j)])),
Y_data_Grid, function(x) mean(x, na.rm = TRUE))
} else {
# Class covariates - The first of the most frequent class in the grid cell is
# chosen
phys_tmp_a <- tapply(
as.character(Y_data_sp@data[,which(names(Y_data_sp) == j)]),
Y_data_Grid,
function(x) {
if (length(x) != 0) {
# factors are transformed as numeric factors for model selection
# Most frequent factor
res <- utils::tail(names(sort(table(x))), 1)
} else {
res <- NA
}
res
}) %>% as.character() %>% as.factor()
}
phys_tmp_a
})
phys_tmp <- as.data.frame(phys_tmp)
names(phys_tmp) <- namesCovariates
for (i in grep("factor_", namesCovariates)) {
phys_tmp[, i] <- as.factor(phys_tmp[, i])
}
# Save new dataset for the grid size chosen
Y_data_sample <- sp::SpatialPointsDataFrame(
coords = as.data.frame(cbind(coordX_tmp, coordY_tmp)),
data = as.data.frame(cbind(dataY, phys_tmp)),
proj4string = CRS(sp::proj4string(Y_data_sp)))
}
if (!missing(coords)) {
if (!is.null(geoR::dup.coords(sp::coordinates(Y_data_sample)))) {
# Message to me:
# First thought that using as.data.frame on dataset led to rounded coordinates
# This is false. Original data is kept safe.
# However, running dup.coords on a dataset with format data.tbl does not work.
# Thus, non need to round coordinates to number of digits = 15 (maximum number of digits
# without computing problems : see print.default)
warning(c("There are duplicated coordinates, ",
"this will prevents the use of Kriging if one of your plans"))
}
}
attr(Y_data_sample, "obs.col") <- 1
return(Y_data_sample)
}
#' Create a RefRaster for regular grid dataset resampling
#'
#' @param x SpatialPointsDataFrame
#' @param res raster resolution targeted. Vector of one or two values for x and y resolutions.
#' @param degree Unit of the resolution. TRUE for degrees, FALSE for meters. Default to FALSE
#' Size one or two, if different resolution for x and y.
#'
#'
#' @importFrom raster projection xmin xmax ymin ymax
#'
#' @export
RefRasterize <- function(x, res, degree = FALSE) {
# Two values resolution
res2 <- (length(res) == 1) * c(res, res) + (length(res) != 1) * res
if (grepl("Spatial", is(x)[1])) {
ext <- raster::extent(x)
crs <- raster::projection(x)
# Calculate resolution according to projection
if (is.na(raster::projection(x))) {
warning("x has no projection, res is supposed to be in the same unit than x")
} else {
# If Same unit, no change res is res
# Different unit
if (!sp::is.projected(x) & !degree) {
# Calculate average distance
pt <- c(xmin(x), (ymin(x) + ymax(x)) / 2)
pts <- cbind(xmax(x), (ymin(x) + ymax(x)) / 2)
dist.max.x <- sp::spDistsN1(pts, pt, longlat = TRUE) * 1000
if (res2[1] > dist.max.x) {
stop(c("x resolution is bigger than extent in x-coordinates. ",
"Extent in x in meters is ", dist.max.x))
}
pt <- c((xmin(x) + xmax(x)) / 2, ymin(x))
pts <- cbind((xmin(x) + xmax(x)) / 2, ymax(x))
dist.max.y <- sp::spDistsN1(pts, pt, longlat = TRUE) * 1000
if (res2[2] > dist.max.y) {
stop(c("y resolution is bigger than extent in y-coordinates. ",
"Extent in y in meters is ", dist.max.y))
}
# nb of columns
n.col <- floor(dist.max.x / res2[1])
res2[1] <- (xmax(x) - xmin(x)) / n.col
# nb of rows
n.row <- floor(dist.max.y / res2[2])
res2[2] <- (ymax(x) - ymin(x)) / n.row
warning(c("Your dataset is in geographic coordinates and res is in meter, ",
"resolution has been recalculated in degrees. res = ",
paste(round(res2, digits = 7), collapse = ", ")))
}
if (sp::is.projected(x) & degree) {
x.m <- data.frame(c(xmin(x),xmax(x)), c(ymin(x),ymax(x)))
names(x.m) <- c("x", "y")
sp::coordinates(x.m) <- ~x+y
sp::proj4string(x.m) <- sp::proj4string(x)
x.deg <- sp::spTransform(x.m, CRSobj = sp::CRS("+proj=longlat"))
dist.max.x <- xmax(x.deg) - xmin(x.deg)
dist.max.y <- ymax(x.deg) - ymin(x.deg)
if (res2[1] > dist.max.x) {
stop(c("x resolution is bigger than extent in x-coordinates. ",
"Extent in x in meters is ", dist.max.x))}
if (res2[2] > dist.max.y) {
stop(c("y resolution is bigger than extent in y-coordinates. ",
"Extent in y in meters is ", dist.max.y))}
# nb of columns
n.col <- round(dist.max.x / res2[1])
res2[1] <- (xmax(x) - xmin(x)) / n.col
# nb of rows
n.row <- round(dist.max.y / res2[2])
res2[2] <- (ymax(x) - ymin(x)) / n.row
warning(c("Your dataset is projected with coordinates in meters,",
"resolution has been recalculated in meters. res = ",
paste(res2, collapse = ", ")))
}
}
} else {
names(x) <- c("x", "y")
sp::coordinates(x) <- ~x+y
crs <- NA
warning(c("x is a data.frame, no projection will be assigned. ",
"res is supposed to be in the same unit than x"))
}
ext <- raster::extent(x)
nCol <- trunc((ext[2] - ext[1])/res2[1]) + 2
nRow <- trunc((ext[4] - ext[3])/res2[2]) + 2
r <- raster::raster(xmn = ext[1] - res2[1],
xmx = ext[1] - res2[1] + nCol * res2[1],
ymn = ext[3] - res2[2],
ymx = ext[3] - res2[2] + nRow * res2[2], ncol = nCol,
nrow = nRow, crs = sp::proj4string(x), vals = 1:(nRow * nCol))
r
}
#' @title Figure to see spatial autocorrelation that may allow to define the grid
#' size for gridded procedure
#'
#' @description Figure to see spatial autocorrelation that may allow to define the grid
#' size for gridded procedure. Two figures are produced with different distance step.
#'
#' @param x A SpatialPointsDataFrame
#' @param y The column number on which to calculate correlation.
#' Otherwise a column should be names "dataY"
#' @param longlat logical. longlat data or not. Default to FALSE
#' @param max1 Numeric. maximum distance of panel 1 in meters.
#' Default to max distance divided by 3.
#' @param lag1 Numeric. step distance for calculating autocorrelation in meters.
#' Default to max1/100.
#' @param max2 Numeric. maximum distance of panel 1 in meters. Default to max1/10.
#' @param lag2 Numeric. step distance for calculating autocorrelation in meters.
#' Default to max2/100.
#' @param binomial Logical. Presence-absence data (TRUE) or continuous data (FALSE).
#' @param thd logical. If TRUE, the function suggest a distance threshold according to
#' breakpoint in correlation. This is only a suggestion. nls function is used.
#' @param plot Logical.
#' @param saveWD directory where to save the output figure. If null, figure appears on screen.
#' @param figname character. If saveWD is not empty, you can specify the name of
#' the output figure (without extension). Default to "Correlogram".
#' @param simplify.grid logical. If dataset is too big, it will be difficult to calculate
#' all distances between all points. A simplification is to divide the area into a grid
#' and calculate distances and correlation in each cell of the grid separately.
#' Results are then merged. Grid used depends on the largest of max1 and max2 values.
#' If less than 10 values, there are randomly merged with another cell
#'
#' @importFrom grDevices png jpeg
#'
#' @return A figure of correlation is showed or saved (in saveWD).
#' A threshold is suggested if thd=TRUE
#' @export
spatialcor_dist <- function(x, y, longlat = FALSE, max1, lag1, max2,
lag2, binomial = TRUE, thd = TRUE,
plot = TRUE, saveWD, figname,
simplify.grid = FALSE) {
if (is(x)[1] != "SpatialPointsDataFrame") {
stop("x must be a SpatialPointsDataFrame")
}
if (sum(grepl("dataY", names(x))) == 0 & missing(y)) {
stop("One column should be named 'dataY' or y should be specified,\nplease use 'Prepare.dataset' function if needed")
}
if (!missing(y)) {names(x)[y] <- "dataY"}
if (simplify.grid) {
if (missing(max1) & missing(max2)) {
stop("In the simplify.grid procedure, max1 or max2 must be specified !")
} else {
if (missing(max1)) {res <- max2}
if (missing(max2)) {res <- max1}
if (!missing(max1) & !missing(max2)) {res <- max(max1, max2)}
}
x.r <- RefRasterize(x = x, res = res)
x.cells <- raster::extract(x.r, x)
# Find cells with less than 10 values
w.less <- which(x.cells %in% names(table(x.cells))[which(table(x.cells) <= 10)])
x.cells[w.less] <- sample(names(table(x.cells))[which(table(x.cells) > 10)],
size = length(w.less), replace = TRUE)
} else {
x.cells <- 1
}
weight <- numeric(0)
meanbyDist_all <- numeric(0)
meanbyDistB_all <- numeric(0)
for (grid in unique(x.cells)) { # grid <- unique(x.cells)[10]
if (simplify.grid) {
x.grid <- x[which(x.cells == grid),]
} else {
x.grid <- x
}
if (!sp::is.projected(x.grid)) {
longlat <- TRUE
}
xyDist <- sp::spDists(x.grid, longlat = longlat)
if (longlat) {xyDist <- xyDist * 1000} # in meters
diag(xyDist) <- NA
xyDist2 <- as.data.frame.table(xyDist)
xyDist2[, 1] <- as.numeric(as.factor(xyDist2[, 1]))
xyDist2[, 2] <- as.numeric(as.factor(xyDist2[, 2]))
if (missing(max1)) {
max1 <- max(xyDist, na.rm = TRUE)/3
}
if (missing(lag1)) {
lag1 <- max1/100
}
if (missing(max2)) {
max2 <- max(xyDist, na.rm = TRUE)/10
}
if (missing(lag2)) {
lag2 <- max2/100
}
# Divide distances into classes
seqdist <- seq(0, max1, lag1)
cutxyDist <- cut(xyDist2[, 3], breaks = seqdist, labels = seqdist[-length(seqdist)])
seqdistB <- seq(0, max2, lag2)
cutxyDistB <- cut(xyDist2[, 3], breaks = seqdistB, labels = seqdistB[-length(seqdistB)])
# test if pres is the same between stations %in% c('PA','PASeuil','PAGLM')
if (binomial) {
testequal <- (x.grid$dataY[xyDist2[, 1]] == x.grid$dataY[xyDist2[, 2]])
} else {
testequal <- ((x.grid$dataY[xyDist2[, 1]] - x.grid$dataY[xyDist2[, 2]]) ^ 2) /
length(x.grid$dataY)
}
# Calculate the similarity into each class
meanbyDist <- tapply(testequal, cutxyDist, function(x) mean(x, na.rm = TRUE))
meanbyDistB <- tapply(testequal, cutxyDistB, function(x) mean(x, na.rm = TRUE))
# sdbyDistB <- tapply(testequal, cutxyDistB, function(x) sd(x, na.rm = TRUE))
if (simplify.grid) {
weight <- c(weight, length(which(x.cells == grid)))
meanbyDist_all <- cbind(meanbyDist_all, meanbyDist)
meanbyDistB_all <- cbind(meanbyDistB_all, meanbyDistB)
}
}
rm(xyDist, xyDist2); gc()
# Regroup outputs of loop
# Average according to number of points
if (simplify.grid) {
meanbyDist <- apply(meanbyDist_all, 1, function(x) {
if (sum(is.na(x)) != length(x)) {
sum(x * weight, na.rm = TRUE) / sum(weight[!is.na(x)])
} else {
NA
}
})
meanbyDistB <- apply(meanbyDistB_all, 1, function(x) {
if (sum(is.na(x)) != length(x)) {
sum(x * weight, na.rm = TRUE) / sum(weight[!is.na(x)])
} else {
NA
}
})
}
if (binomial) {
ymin <- -0.01
ymax <- 1.05
laby <- "Correlation"
} else {
ymin <- min(c(meanbyDist, meanbyDistB), na.rm = TRUE)
ymax <- max(c(meanbyDist, meanbyDistB), na.rm = TRUE)
laby <- "MSE" # Mean squared error
}
# Find the recommended distance threshold according to break in correlation
if (thd) {
# thd 1
dist <- as.numeric(as.character(names(meanbyDist))) + 0.5 * lag1
lm.init <- lm(meanbyDist ~ dist)
res <- apply(t(dist), 2, function(z) {
aic <- NA
try(aic <- AIC(nls(meanbyDist ~
(dist <= z) * (A * dist + B) +
(dist > z) * (D * (dist - z) + A * z + B),
start = list(A = coef(lm.init)[2],
B = coef(lm.init)[1],
D = 0))), silent = TRUE)
aic
})
dist.min <- dist[which(res == min(res, na.rm = TRUE))]
lm1 <- nls(meanbyDist ~ (dist <= dist.min) * (A * dist + B) + (dist > dist.min) *
(D * (dist - dist.min) + A * dist.min + B),
start = list(A = coef(lm.init)[2],
B = coef(lm.init)[1], D = 0))
# thd 2
dist2 <- as.numeric(as.character(names(meanbyDistB))) + 0.5 * lag2
lm.init <- lm(meanbyDistB ~ dist2)
res <- apply(t(dist2), 2, function(z) {
aic <- NA
try(aic <- AIC(nls(meanbyDistB ~
(dist2 <= z) * (A * dist2 + B) +
(dist2 > z) * (D * (dist2 - z) + A * z + B),
start = list(A = coef(lm.init)[2],
B = coef(lm.init)[1], D = 0))),
silent = TRUE)
aic
})
dist2.min <- dist2[which(res == min(res, na.rm = TRUE))]
lm2 <- nls(meanbyDistB ~ (dist2 <= dist2.min) * (A * dist2 + B) +
(dist2 > dist2.min) * (D * (dist2 - dist2.min) +
A * dist2.min + B),
start = list(A = coef(lm.init)[2],
B = coef(lm.init)[1], D = 0))
}
# Plot the correlogram
if (plot) {
if (!missing(saveWD)) {
if (!dir.exists(saveWD)) {dir.create(saveWD)}
if (missing(figname)) {figname <- "Correlogram"}
jpeg(filename = paste0(saveWD, "/", figname, ".jpg"), width = 16, height = 8, units = "cm",
pointsize = 5, quality = 100, bg = "white", res = 300)
} else {
dev.new()
}
par(mfrow = c(2, 1), mar = c(4, 4, 3, 1))
plot(as.numeric(as.character(names(meanbyDist))) + 0.5 * lag1, meanbyDist,
pch = 20, cex = 1, ylim = c(ymin, ymax), xlim = c(0, max1), xaxs = "i",
xlab = "distance (m)", ylab = laby,
main = paste("Correlogram : lag=", round(lag1, digits = 2), "m", sep = ""))
abline(v = c(lag1, 2 * lag1, 5 * lag1, 10 * lag1),
col = "grey", lty = "dashed")
axis(1, at = c(lag1, 2 * lag1, 5 * lag1, 10 * lag1),
labels = c("lag1", "2x", "5x", "10x"), line = 1.5,
col.axis = "grey40", col = "grey40")
if (thd) {
# thd1
lines(dist, (dist <= dist.min) * (coef(lm1)[1] * dist + coef(lm1)[2]) +
(dist > dist.min) * (coef(lm1)[3] * (dist - dist.min) + coef(lm1)[1] *
dist.min + coef(lm1)[2]),
col = "orange", lwd = 1.5)
abline(v = dist.min)
axis(1, at = dist.min, labels = paste("thd1 =", round(dist.min, digits = 1)),
line = -0.8,
col.axis = "grey40",
col = "grey40")
}
plot(as.numeric(as.character(names(meanbyDistB))) + 0.5 * lag2, meanbyDistB,
pch = 20, cex = 1, ylim = c(ymin, ymax), xlim = c(0, max2), xaxs = "i",
xlab = "distance (m)", ylab = laby,
main = paste("Correlogram : lag=", round(lag2, digits = 2), "m", sep = ""))
abline(v = c(lag2, 2 * lag2, 5 * lag2, 10 * lag2), col = "grey", lty = "dashed")
axis(1, at = c(lag2, 2 * lag2, 5 * lag2, 10 * lag2),
labels = c("lag2", "2x", "5x", "10x"), line = 1.5,
col.axis = "grey", col = "grey")
if (thd) {
# thd2
lines(dist2, (dist2 <= dist2.min) *
(coef(lm2)[1] * dist2 + coef(lm2)[2]) +
(dist2 > dist2.min) *
(coef(lm2)[3] * (dist2 - dist2.min) +
coef(lm2)[1] * dist2.min + coef(lm2)[2]),
col = "orange", lwd = 1.5)
abline(v = dist2.min)
axis(1, at = dist2.min,
labels = paste("thd2 =", round(dist2.min, digits = 2)),
line = -0.8, col.axis = "grey40",
col = "grey40")
}
if (!missing(saveWD)) {
dev.off()
}
}
if (thd) {
thd.out <- c(dist.min, dist2.min)
names(thd.out) <- paste0("thd",1:2)
return(thd.out)
}
}
#' Merge covariate rasters from different extents and projections into a multi-layer Raster
#'
#' @param cov.paths vector of paths of all raster files to be added in the multi-layer raster
#' @param r.ref Raster used as a reference for extent, resolution, crs.
#' Default is the first one of the list.
#' @param ext Optional object of class extent to crop r.ref before stack.
#' @param outWD Directory where to save new reprojected, cropped rasters.
#' Default to "tempdir()/extent".
#' @param names.cov Optional. vector of names of multi-layer raster. Same length as x.
#' @param is.factor position of covariates that are factors or categories.
#' Resampling (if required) is bilinear by default but "ngb" (nearest neighbour) for factors.
#' @param verbose level of messages shown
#' @param cl a cluster as made with \code{\link[parallel]{makeCluster}}.
#' If cl is empty, nbclust in \code{\link{modelselect_opt}} will be used
#' to create a cluster.
#'
#' @export
#'
#' @details Rasters are merged in a unique RasterStack. If the projection of
#' a raster is different than the reference (r.ref), it is reprojected. If the
#' extent is different, it is resampled. If the resolution is smaller than
#' 0.9*res, it is first aggregated, then resampled.
Prepare_covarStack <- function(cov.paths, r.ref, ext, outWD,
names.cov, is.factor, verbose = 1,
cl = NULL) {
if (missing(outWD)) {
outWD <- paste0(tempdir(), "/extent")
}
if (!dir.exists(outWD)) {
dir.create(outWD)
}
if (is.null(cl)) {
cl_inside <- TRUE
} else {
cl_inside <- FALSE
options(rasterClusterObject = cl)
options(rasterClusterCores = length(cl))
options(rasterCluster = TRUE)
options(rasterClusterExclude = NULL)
}
nbclust <- modelselect_opt$nbclust
if (missing(r.ref)) {
r.ref <- raster::raster(cov.paths[1])
}
if (!missing(ext)) {
r.ref <- raster::crop(r.ref, ext)
}
method <- rep("bilinear", length(cov.paths))
if (!missing(is.factor)) {
method[is.factor] <- "ngb"
}
for (i in 1:length(cov.paths)) {
r.x <- raster::raster(cov.paths[i])
name.file <- strsplit(basename(cov.paths[i]),
raster::extension(cov.paths[i]), fixed = TRUE)[[1]][1]
if (!missing(names.cov)) {
name.x <- names.cov[i]
} else {
name.x <- name.file
}
file.ext <- paste0(outWD, "/", gsub("_extent", "", name.file), "_extent.tif")
# Re-write projection if same proj written differently
if (sp::proj4string(r.x) != sp::proj4string(r.ref)) {
#' Method depends if projection is really different,
#' otherwise this is only a change in name of proj
if (is_ProjEqual(x = sp::proj4string(r.x), y = sp::proj4string(r.ref))) {
sp::proj4string(r.x) <- sp::proj4string(r.ref)
if (verbose >= 1) {
message(paste("Equal proj modified", i, ":", name.x))
}
} else {
# Use resample as shown below
# compareRaster will return FALSE
if (verbose >= 1) {
message(paste("projectRaster", i, "=", name.x))
}
if (cl_inside) {
raster::beginCluster(nbclust)
}
r.ext <- raster::projectRaster(
r.x,
crs = sp::proj4string(r.ref),
method = method[i]
)
r.x <- r.ext
rm(r.ext); gc()
if (cl_inside) {
raster::endCluster()
}
}
}
# Resample rasters if needed
if (!raster::compareRaster(r.x, r.ref, stopiffalse = FALSE)) {
if (file.exists(file.ext)) {
print(paste("load resampled", i, "=", name.x))
r.ext <- raster::raster(file.ext)
} else {
if (cl_inside) {
raster::beginCluster(nbclust)
}
if (raster::xres(r.x) < 0.9 * raster::xres(r.ref) |
raster::yres(r.x) < 0.9 * raster::yres(r.ref)) {
# Aggregate before resampling if resolution is far from the target
if (verbose >= 1) {
message(paste("aggregate", i, "=", name.x,
"with", method[i]))
}
r.x <- raster::aggregate(
r.x,
fact = trunc(c(raster::xres(r.ref)/raster::xres(r.x),
raster::yres(r.ref)/raster::yres(r.x))),
fun = function(x, na.rm = na.rm) {
if (method[i] == "bilinear") {
res <- mean(x, na.rm = na.rm)
} else {
# Most frequent factor
res <- as.numeric(as.character(
utils::tail(names(sort(table(x))), 1)))
if (length(res) == 0) {
res <- NA
}
}
res
},
na.rm = TRUE)
}
if (verbose >= 1) {
message(paste("resample", i, "=", name.x, "with", method[i]))
}
r.ext <- raster::resample(r.x, r.ref, method = method[i],
filename = file.ext,
overwrite = TRUE)
if (cl_inside) {
raster::endCluster()
}
}
} else {
r.ext <- r.x
}
if (i == 1) {
r.multi <- r.ext
} else {
r.multi <- raster::stack(r.multi, r.ext)
}
}
if (!missing(names.cov)) {
names(r.multi) <- names.cov
}
return(r.multi)
}
#' @title Function to extract covariates from different rasters
#'
#' @description Original covariates raster may have different projections.
#' CovarExtract allows to extract in these different projections.
#'
#' @param x SpatialPointsDataFrame
#' @inheritParams Prepare_covarStack
#'
#' @export
CovarExtract <- function(x, cov.paths, names.cov, is.factor) {
if (is(x)[1] != "SpatialPointsDataFrame") {
stop("x should be a SpatialPointsDataFrame")
}
res.list <- lapply(1:length(cov.paths), function(i) {
r.tmp <- raster::raster(cov.paths[i])
if (!is_ProjEqual(sp::proj4string(r.tmp), sp::proj4string(x))) {
x.proj <- sp::spTransform(x, raster::crs(r.tmp))
} else {x.proj <- x}
res <- data.frame(raster::extract(r.tmp, x.proj))
names(res) <- names(r.tmp)
res
})
res.tab <- do.call("cbind", res.list)
if (!missing(names.cov)) {
names(res.tab) <- names.cov
}
if (!missing(is.factor)) {
for (i in is.factor) {
res.tab[,i] <- as.factor(as.character(res.tab[,i]))
}
}
sp::cbind.Spatial(x, res.tab)
}
#' @title Function to identify covariates that are correlated in terms of Spearman's rank
#'
#' @description Correlation engender problems of identifiability.
#' Correlated parameters in the dataset will be separated in tested models.
#' Test for the Spearman factor for non linear correlation between covariates
#' of all the stations.
#' Complete the test with a visual test if needed.
#'
#'
#' @param x dataframe of covariates only, on which to test correlation of covariates.
#' This can also be dataset as issued from \code{\link{Prepare_dataset}}.
#' @param rm vector of column numbers to be removed from the analysis. Default to NULL.
#' If you specify your complete dataset as \code{x}, you may define \code{rm=1} to
#' remove your observations from the columns.
#' @param visual logical. Whether to define visually if data are considered
#' correlated or not. See details. Be careful, all previous figures are
#' closed with graphics.off() before running visual analysis.
#' @param thd numeric. Correlation (absolute) value above which to consider that covariates
#' are correlated and should not remain in the same model. This value is necessary
#' when visual = FALSE. See details.
#' @param plot logical. Whether to plot the figure of correlation values between
#' covariates. Set to FALSE if using a MPI cluster.
#' @param saveWD path to directory where to save a jpg figure file. If NULL and
#' plot = TRUE, then figure is shown on screen.
#' @param figname character. The name (w/o extension) of the figure to be saved in saveWD.
#' @param img.size size in cm of the output (square) image. May increase labels size as well.
#'
#' @importFrom magrittr %>%
#' @importFrom methods is
#' @importFrom grDevices graphics.off dev.new dev.set dev.prev dev.off png
#' @import graphics
#'
#' @details
#' \itemize{
#' \item Correlation of covariates is to be tested in the dataset itself. It is not
#' important if covariates are correlated in real life. What affect model fits
#' is the correlation within the dataset. Therefore is this function...
#' \item Spearman's rank correlation coefficient has been chosen as it allows to
#' test for correlation between categorical and continuous variables. This may
#' look as a non-sense to test for correlation with categorical covariates.
#' In some cases, the order of classes inside a category may have a sense, e.g.
#' continuous variable that has been categorised for any reason. In this function
#' classes are temporary turned into numbers to calculate correlation.
#' This implies that the alphanumeric order of classes has a sense.
#' The Spearman test is also less sensitive to non-gaussian distributions of data
#' \item In some cases, correlation value is high because of one extreme rank value.
#' Visual verification allows to define for each couple of covariates if the user
#' may consider correlation or not. Figures are grouped by correlation values
#' which accelerates the visual verification.
#' \item It is suggested to do a visual verification the first time data are
#' processed or to do a complete real exploration of your dataset before
#' running any model. This exploration may suggest to remove or combine
#' covariates before starting the modelling procedure. After that, you may be
#' able to define a thd value for covariate correlation limit.
#' \item Defining a high thd value may let some correlated covariates to appear in
#' the same model in the following of the procedure. Nevertheless, because the
#' procedure of this package is based on cross-validation, if two variables are
#' correlated they may likely have a low score as they will be as efficient as
#' covariate alone. Thus, if you hesitate between two thd values, choose the
#' higher one, as it will allow more models to pass through the selection procedure.
#' }
#'
#' @export
Param_corr <- function(x, rm = NULL, visual = FALSE, thd = 0.7, plot = TRUE, saveWD = NULL,
figname = "Covariate_correlation", img.size = 12)
{
if (!is.null(attr(x, "obs.col"))) {
rm <- unique(c(rm, attr(x, "obs.col")))
}
# if (visual) {require(tcltk)}
if (is(x)[1] == "SpatialPointsDataFrame") {
if (!is.null(rm)) {
physParam_NewSp <- x@data[,-rm]
} else {
physParam_NewSp <- x@data
}
} else {
if (!is.null(rm)) {
physParam_NewSp <- x[,-rm]
} else {
physParam_NewSp <- x
}
}
# All combinations of 2 covariates
# Change names so that names order reflect column order
names.save <- names(physParam_NewSp)
names(physParam_NewSp) <-
paste0("p", formatC(1:ncol(physParam_NewSp),
width = nchar(ncol(physParam_NewSp)),
flag = "0"), names(physParam_NewSp))
# Change factor as numeric for correlation only
if (any(sapply(physParam_NewSp, function(x) !is.numeric(x)))) {
message("Variables are transformed as numeric for correlation tests only. ",
"This may not be optimal, but this will not alter the rest of the analysis")
}
fig.tmp <- physParam_NewSp %>%
dplyr::mutate_at(.vars = grep("^factor_", names(physParam_NewSp)),
.funs = function(x) as.numeric(as.factor(x))) %>%
dplyr::mutate_if(sapply(., function(x) !is.numeric(x)), as.numeric)# %>%
# dplyr::mutate_if(sapply(., is.factor), as.numeric)
# Calculate correlation with corrr
corSpearman.tmp <- fig.tmp %>%
corrr::correlate(method = "spearman") %>%
dplyr::mutate_all(function(x) ifelse(is.na(x), 1, x))
class(corSpearman.tmp) <- c("cor_df", class(corSpearman.tmp))
# dplyr 0.8.0
corSpearman <- corSpearman.tmp %>%
corrr::shave() %>%
tidyr::gather(key = "Var2", value = "Corr", -term) %>%
dplyr::rename(Var1 = term) %>%
#fashion(leading_zeros = TRUE) %>%
# as.data.frame.table() %>%
# filter(Freq != "") %>%
dplyr::mutate_at(
.vars = 1:2,
# .funs = dplyr::funs("num" = as.numeric(as.factor(.)))) %>%
.funs = list("num" = ~as.numeric(as.factor(.x)))) %>%
dplyr::filter(!is.na(Corr)) %>%
as.data.frame()
# Test visually each couple of covariates using tcltk
if (visual)
{
warning("Visual analysis uses library(tcltk) which may not work
properly using Rstudio.")
graphics.off()
cutSpearman <- cut(abs(corSpearman$Corr), breaks = seq(0,1,0.1))
keep <- logical(0)
placeX <- logical(0)
# Test correlations in decreasing correlation
for (val in length(levels(cutSpearman)):4) {
# 4 is for no test under correlation < 0.3 which is too low for any corr
val1 <- levels(cutSpearman)[val]
X <- which(cutSpearman == val1)
placeX <- c(placeX, X)
if (length(X) != 0) {
seq.fig <- c(seq(1, length(X), 25), length(X))
for (n.fig in 1:(length(seq.fig) - 1)) {
dev.new()
# Draw bowplot of couples of covariates
if (length(X) <= 6) {par(mfrow = c(2,3), ask = TRUE)}
if (length(X) > 6) {par(mfrow = c(3,4), ask = TRUE)}
if (length(X) > 12) {par(mfrow = c(5,5), ask = TRUE)}
for (ii in seq.fig[n.fig]:seq.fig[n.fig + 1]) {
if (length(X) > 25 & ii == seq.fig[n.fig] + 2) {
mtext(text = paste("Figure", n.fig), side = 3,
line = 1, outer = TRUE)
}
boxplot(rank(physParam_NewSp[,unlist(corSpearman[ii,4])],
na.last = "keep") ~
round(rank(physParam_NewSp[,unlist(corSpearman[ii,5])],
na.last = "keep")/20),
main = val1,
xlab = names(physParam_NewSp)[unlist(corSpearman[ii,4])],
ylab = names(physParam_NewSp)[unlist(corSpearman[ii,5])])
}
if (length(X) > 12) {
dev.set(dev.prev())
}
} # n.fig
# Fonction to ask if corr is retained or not
tt <- tcltk::tktoplevel()
tl <- tcltk::tklistbox(tt, height = 4, selectmode = "single",
background = "white")
tcltk::tkgrid(tcltk::tklabel(tt, text = "Is it overall visually correlated ?"))
tcltk::tkgrid(tl)
fruits <- c("yes_all", "no_none", "yes_somes", "")
for (i in (1:4))
{
tcltk::tkinsert(tl, "end", fruits[i])
}
tcltk::tkselection.set(tl,3)
OnOK <- function()
{
rbVal <- fruits[as.numeric(tcltk::tkcurselection(tl)) + 1]
tcltk::tkdestroy(tt)
if (rbVal == "yes_all") {tcltk::tkmessageBox(
message = "All is considered correlated")
assign("truc", 1)}
if (rbVal == "no_none") {tcltk::tkmessageBox(
message = "Nothing is considered correlated")
assign("truc", 0)}
if (rbVal == "yes_somes") {tcltk::tkmessageBox(
message = "You need to choose which are correlated,
put 1 in the table")
assign("truc", -1)}
}
truc <- NA
OK.but <- tcltk::tkbutton(tt,text = " OK ", command = OnOK)
tcltk::tkgrid(OK.but)
tcltk::tkfocus(tt)
repeat {if (is.na(truc) == FALSE) {break}}
# Assign data chosen to the list of data
if (truc == -1) {
ind <- cbind(names(physParam_NewSp)[unlist(corSpearman[X,4])],
names(physParam_NewSp)[unlist(corSpearman[X,5])], 1)
utils::fix(ind)
keep <- c(keep, ind[,3])
} else {
keep <- c(keep, rep(truc, length(X)))
}
} # if X != 0
}
par(ask = FALSE)
} # end of visual == TRUE
# Store whether the parameters are correlated or not
corSpearman$valid <- 0
if (visual) {
corSpearman$valid[placeX] <- keep
} else {
# Parameters with spearman rank correlation higher than 0.8 wont be in the same model
message(c("thd value is set to "), thd)
corSpearman$valid[which(abs(corSpearman$Corr) >= thd)] <- 1
}
if (plot) {
# Figure of correlations coefficients
fig.cor <- fig.tmp %>% cor(method = "spearman")
fig.cor[which(is.na(fig.cor))] <- 1
colnames(fig.cor) <- rownames(fig.cor) <- names.save
# Test with ggplot
# library(ggplot2)
# ggplot(data = corSpearman, aes(Var2, Var1, fill = Corr)) +
# geom_tile(color = "white") +
# scale_fill_gradient2(low = "blue", high = "red", mid = "white",
# midpoint = 0, limit = c(-1,1), space = "Lab",
# name = "Pearson\nCorrelation") +
# theme_minimal()+
# theme(axis.text.x = element_text(angle = 45, vjust = 1,
# size = 12, hjust = 1)) +
# coord_fixed()
# Add shade lines for couples removed from models
fig.cor.0 <- fig.cor * 0
for (x in 1:nrow(corSpearman)) {
fig.cor.0[corSpearman[x, "Var1_num"], corSpearman[x,"Var2_num"]] <-
fig.cor.0[corSpearman[x,"Var2_num"], corSpearman[x,"Var1_num"]] <-
corSpearman$valid[x]
}
# Do a loop to use suggested dev size of corrplot (less white margins)
ratio <- 1
# for (loop.fig in 1:2) {
# To reset par()
par(mar = c(0,0,0,0)); plot.new(); dev.off()
if (!is.null(saveWD)) {
png(filename = paste0(saveWD, "/", figname, ".png"),
width = img.size, height = img.size * ratio, units = "cm", pointsize = 5,
#quality = 100,
res = 500
)
} else {
dev.new()
}
# Correlation plot
corrplot::corrplot(
fig.cor, #mar = c(0, 0.5, 0.5, 0.5),
tl.cex = 2.5 * min(img.size, img.size * ratio) / ncol(physParam_NewSp), method = "square",
diag = FALSE,
tl.srt = 45, # cex.main = 2.5,
title = "",
type = "lower",
# tl.pos = "n",
tl.col = "black",
cl.cex = 1.5 * min(img.size, img.size * ratio) / ncol(physParam_NewSp) + ncol(physParam_NewSp) / 150,
cl.ratio = 0.1,
# h.ratio = TRUE,
win.asp = ratio)
# Keep or not keep couples
corrplot::corrplot(
fig.cor.0,
tl.cex = img.size / ncol(physParam_NewSp),
method = "shade",
col = c("transparent"), addshade = "positive",
shade.col = "grey",
shade.lwd = 2 * min(img.size, img.size * ratio) / ncol(physParam_NewSp),
diag = FALSE, add = TRUE, bg = "transparent",
tl.pos = "n", cl.pos = "n",
type = "lower", tl.col = "black")
if (!is.null(saveWD)) {
dev.off()
# Remove white margins
# if (loop.fig == 2) {
Rm_WhiteMargins(paste0(saveWD, "/", figname, ".png"), pixel = 5)
# }
}
#} # end of loop.fig
} # end of plot
corSpearman <- corSpearman %>%
dplyr::mutate(Var1 = names.save[Var1_num]) %>%
dplyr::mutate(Var2 = names.save[Var2_num])
return(corSpearman)
} # End of ParamCorr function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.