Requirements : - script is runned right after prepare_and_run_GDM.Rmd. Requires its output gdminput. - note that gdminput is also saved as .csv and .Rds from the above mentioned script. - nicenames for nice correlation plot - model_names_selection : a row of the dataframe model_names

The script calculates euclidean distances abs(simple substraction) of the covariates and LUI as well as the geographic distance based on coordinates and forms an unscaled and a scaled table with all distances. A correlation plot is produced.

For more exploration, see also the script check_GDM_input_LM_GAM.Rmd with vif, linear models and GAMS. The i.i.d. assumption is not met with matrix input, therefore these explorations have been moved to additional results.

Output : - datasets <modelname>_gdminput_distances_unscaled.rds and <modelname>_gdminput_distances_zscores.rds contains the distances of all variables in long format (unlike the GDM input data which is raw values for all non-beta predictors) - correlation plot vignettes/out/<modelname>_corrplot_before_gdm.pdf

creating dataset

lui <- c(model_names_selection$lui, paste("delta", model_names_selection$lui, sep = ""))
checkgdminput <- data.table::copy(gdminput)
checkgdminput <- data.table(checkgdminput)

# calc euclidean distance between each predictor pair of checkgdminput
# diversity : take out s2. because they are all 0
dists <- colnames(checkgdminput)[grep("beta", colnames(checkgdminput))]
dists <- grep("s2.", dists, value = T)
checkgdminput[, (dists) := NULL]
checkgdminput[, weights := NULL]

# LUI
luicols <- grep(paste(lui, collapse="|"), colnames(checkgdminput), value=TRUE)
if(length(grep("Gstd", lui)) > 1){
  print("Grazing")
  checkgdminput[, Gstd := abs(s1.Gstd - s2.Gstd)]
  checkgdminput[, deltaGstd := abs(s1.deltaGstd - s2.deltaGstd)]
  checkgdminput[, (luicols) := NULL]
} else if(length(grep("Mstd", lui)) > 1) {
  print("Mowing")
  checkgdminput[, Mstd := abs(s1.Mstd - s2.Mstd)]
  checkgdminput[, deltaMstd := abs(s1.deltaMstd - s2.deltaMstd)]
  checkgdminput[, (luicols) := NULL]
} else if(length(grep("Fstd", lui)) > 1) {
  print("Fertilisation")
  checkgdminput[, Fstd := abs(s1.Fstd - s2.Fstd)]
  checkgdminput[, deltaFstd := abs(s1.deltaFstd - s2.deltaFstd)]
  checkgdminput[, (luicols) := NULL]
} else if(length(grep("LUI", lui)) > 1) {
  print("LUI")
  checkgdminput[, LUI := abs(s1.LUI - s2.LUI)]
  checkgdminput[, deltaLUI := abs(s1.deltaLUI - s2.deltaLUI)]
  checkgdminput[, meanLUI := rowMeans(.SD), .SDcols = c("s1.LUI", "s2.LUI")]
  checkgdminput[, sdLUI := apply(.SD, 1, sd), .SDcols = c("s1.LUI", "s2.LUI")]
  checkgdminput[, (luicols) := NULL]
}

#TODO keep an eye on distGeo, removed the suggestion to "raster" package which 
# depends on retired packages.
checkgdminput[, geosphere_distgeo := geosphere::distGeo(
  p1 = as.matrix(checkgdminput[, .(s1.yCoord, s1.xCoord)]),
  p2 = as.matrix(checkgdminput[, .(s2.yCoord, s2.xCoord)])
  )]
geocol <- grep("Coord", colnames(checkgdminput), value = T)
checkgdminput[, (geocol) := NULL]
# checkgdminput[, s2.geosphere_distgeo := NULL]

# predictors
checkgdminput[, edis_soil := abs(s1.edis_soil - s2.edis_soil)]
checkgdminput[, plot_isolation := abs(s1.plot_isolation - s2.plot_isolation)]
del <- c("s1.edis_soil", "s2.edis_soil", "s1.plot_isolation", "s2.plot_isolation")
checkgdminput[, (del) := NULL]

# rename s1 from diversity columns
colnames(checkgdminput) <- gsub("s1.", "", colnames(checkgdminput))

# compute z-score
# observe correlations across the z-scores of variables, not the raw ones.
to_scale <- grep("beta|distance", colnames(checkgdminput), invert = T, value = T)
scaled_checkgminput <- data.table::copy(checkgdminput)
scaled_checkgminput[, (to_scale) := lapply(.SD, scale, center = T, scale = T), .SDcols = to_scale]
rm(to_scale)

# save datasets
saveRDS(checkgdminput, 
        file = paste("vignettes/out/", model_names_selection$modelname, "_gdminput_distances_unscaled.rds", sep = ""))
saveRDS(scaled_checkgminput, 
        file = paste("vignettes/out/", model_names_selection$modelname, "_gdminput_distances_zscores.rds", sep = ""))

correlation plot

## correlation plot
# use nice names for plotting, only those which are available
c <- nicenames$nicenames
names(c) <- nicenames$names
c <- c[names(checkgdminput)]
names(checkgdminput)[!is.na(c)] <- c[!is.na(c)]
# names(checkgdminput)[is.na(c)] <- c("EFdist", "geographic distance")
names(checkgdminput) <- gsub("\\\\narthropodsoillarvae", " a. larvae", names(checkgdminput))

# name : modelname "_corrplot_before_gdm.pdf"
m <- cor(checkgdminput, method = "spearman")
cairo_pdf(paste("vignettes/out/", model_names_selection$modelname, "_corrplot_before_gdm_main_models.pdf", sep = ""), width = 11.7, height = 8.3)
corrplot::corrplot(m, type = "lower", addCoef.col = "black", method = "color", diag = F, tl.srt = 50, tl.col = "black", order = "hclust", tl.cex = 0.5, cl.cex = 0.3, number.cex = 0.25)
dev.off()

m[m < 0.5 & m > -0.5] <- 0
corrplot::corrplot(m, type = "lower", addCoef.col = "black", method = "color", diag = F, tl.srt = 10, tl.col = "black", order = "hclust", tl.cex = 0.5, cl.cex = 0.3, number.cex = 0.3)

# output high correlations as a table
h <- as.matrix(m)
h[!lower.tri(h)] <- 1000
h <- reshape2::melt(h, value.name = "correlation")
h <- data.table(h)
h <- h[!correlation == 1000, ]
h <- h[correlation > 0.7 | correlation < -0.7, ][order(correlation)]
fwrite(h, file = paste("vignettes/out/", model_names_selection$modelname, "_correlations_before_gdm_table.csv", sep = ""))

Next script is plot_GDM.Rmd.



allanecology/BetaDivMultifun documentation built on Nov. 9, 2023, 8:47 p.m.