Updates 09.02.23

#TODO : tune the mtry parameter (probably around 20 would be good)
#TODO : calculate the assembled functions AFTER imputation
#TODO : recheck negative values
#TODO : note that the variable-wise error depends on the scale --> keep in mind
# note : impute response and explanatory separately

This script : - uses raw functions from the synthesis dataset (loaded with analysis_nonpublic.R, variable raw_grlfuns) - note if taken from bexis, first run the reformatting script bexis_to_wide_format.R attached to the dataset. - The grassland functions data used for gapfilling is stored in raw_grlfuns, selected variables according to info data. raw_grlfuns is loaded by analysis_nonpublic.R. Make sure you also run the selection of functions for gapfilling! - performs gap-filling - saves the gap-filled dataset - required input : info_data, raw_grlfuns, plotNAset (the plot set defined by diversity data in calc_betadiversities.Rmd with the final set of included plots for analysis)* - output : generates gap-filled dataset impdat, and imp_err contianing the imputation error. Two variables are produced, impdat which contains the gapfilled dataset which will be used in further analysis as well as imp_err, which contains the imputation error automatically estimated by missForest. The imputed dataset is stored as .rds file.

sections_to_be_loaded <- c("assemble_functions")
source("vignettes/analysis_nonpublic.R")

The dataset used for gap-filling is larger than the dataset used for the analysis. More variables enhance the performance of the gap-filling. Note : Only use additional variables which are not used in the analysis. Otherwise, data can be circular. (This means that e.g. no soil data is used for imputation, but additional functions which are not included in the later analysis).

Data preparation

befun <- data.table::copy(raw_grlfuns)

chosing the plot set

befun <- befun[Plotn %in% plotNAset]

cleaning out variables with too many missing values

visdat::vis_miss(befun, sort_miss = T)

Values with over 21% missing data could be excluded here (mAMF hyphae, Parasitoid traps, Aggregation, PRI.2011 and NaHCO3.Pi).

Here, values with too many missings are not taken out yet. Instead, the imputation is done and the variables are only removed in case their imputation error is too high.

#USER : set treshold for missing values
treshold <- 0.21
t <- apply(befun, 2, function(x) sum(is.na(x)))
exclude <- names(which(t > 150 * treshold))
befun <- befun[, !colnames(befun) %in% exclude, with=F]
rm(t); rm(exclude); rm(treshold)

If the missing plot from 16S, amoA_AOA.2016 & Co is taken out, 3 more functions can be included in the functions set.

# befun[is.na(`16S_NB`), Plotn]
# befun[is.na(amoA_AOA.2016), Plotn]
# befun[is.na(amoA_AOB.2016), Plotn]
# visdat::vis_miss(befun[, .(`16S_NB`, amoA_AOA.2016, amoA_AOB.2016, amoA_AOA.2011)], sort_miss = T)
# The missing plot is "HEG31"
befun <- befun[Plotn != "HEG31"]

update the helper vector plotNAset.rds and take out the plot "HEG31"

# readRDS("plotNAset.rds")
plotNAset <- plotNAset[-which(plotNAset == "HEG31")]
saveRDS(plotNAset, "plotNAset.rds")

Check Correlations

Of the non-factor columns only

corrbefun <- befun[, !colnames(befun) %in% c("Explo", "Plotn", "Plot"), with=F]
M <- cor(corrbefun, use="pairwise.complete.obs")
M_before <- M # save for later comparison with M_after

# raw_grlfun_corrplot1
corrplot::corrplot(M,type="lower",addCoef.col = "black",method="color",diag=F, tl.srt=1, tl.col="black", mar=c(0,0,0,0), number.cex=0.25, tl.cex = 0.3)

# raw_grlfun_corrplot2
corrplot::corrplot(M, type = "upper", tl.col="black", tl.srt=90, diag = F, tl.cex = 0.5, mar = c(2, 0, 0, 0)) # Supplementary figure XY

# only show high correlations
tres <- 0.7
M[which(M < tres & M > -tres)] <- 0
corrplot::corrplot(M, type = "upper", tl.col="black", tl.srt=40, diag = F, tl.cex = 0.3)

Network

# show network - is there a variable "alone"? bzw. isolated?
network <- igraph::graph_from_adjacency_matrix(M, weighted=T, mode="undirected", diag=F)
plot(network)

There is no variable which is not connected with at least one line to another variable. Lines represent correlations > threshold.

rm(M); rm(network)

Log transformation

before changing values, store the original dataset

befunbackup <- data.table::copy(befun)

remove negative values, remember to add again after imputation!

find negative values

par(mar = c(10, 1, 3, 1))
boxplot(corrbefun, las=2)
# minimum of the numeric columns
negative <- colnames(corrbefun)[which(corrbefun[, lapply(.SD, function(x) min(x, na.rm=T))] < 0)]

remove negative values - shift whole vector up.

# DEA.inverted
mindea <- min(befun$DEA.inverted, na.rm=T)
befun$DEA.inverted <- befun$DEA.inverted + abs(mindea)
# NO3.2014
minno <- min(befun$NO3.2014, na.rm=T)
befun$NO3.2014 <- befun$NO3.2014 + abs(minno)
# dung.removal
mindung <- min(befun$dung.removal, na.rm=T)
befun$dung.removal <- befun$dung.removal + abs(mindung)

rm(negative)

Log transformation of the numeric columns. The natural logartithm of values smaller than 1 is negative. Therefore, all values are shifted by 1 again to avoid any negative values in the log transformed dataset.

numcols <- colnames(befun)[!colnames(befun) %in% c("Explo", "Plotn", "Plot")]
befun[, (numcols) := lapply(.SD, as.numeric), .SDcols = numcols]
# log transform the values + 1, later shift them back.
befun[, (numcols) := lapply(.SD, function(x) log(x+1)), .SDcols = numcols]
# record NA value positions for visualistation
naloc <- is.na(befun)

Tuning imputation parameters

# recoding Explo as factor for imputation
befun[, Explo := as.factor(Explo)]
dataset <- befun[, !colnames(befun) %in% c("Plotn", "Plot"), with=F]

Tuning mtry paramteter (additional)

By default square root of p = number of columns. Try values between 2 and 50 to tune the OOB error. Find the minimum of error and use the according mtry parameter value. Around 20 worked well in previous projects.

source script : https://github.com/eric-allan/BE.RE.analysis/blob/master/EF_select_suitable_functions.R by Caterina Penone

# note : only run once, then work with the output dataset.
maxiteration <- 50
sqrt(ncol(befun))
OOB_NRMSE <- c()
i <- 1
for (mtry in 2:maxiteration) {
   imp.matX <- missForest::missForest(dataset, mtry = mtry)
   OOB_NRMSE[i] <- imp.matX$OOBerror[1]
   i <- i + 1
   print(i)
}
rm(i)

write.table(OOB_NRMSE, file = "function_imputation_parameter_tuning_mtry.csv", sep = ";")

visualise mtry optimisation

OOB_NRMSE <- read.table(file = paste(pathtodata, "/data_assembly/output_data/function_imputation_parameter_tuning_mtry.csv", sep = ""), sep = ";")
OOB_NRMSE <- OOB_NRMSE$x

pdf(file = "function_imputation_parameter_tuning_mtry.pdf", paper = "a4r")
plot(seq(2, maxiteration), OOB_NRMSE, type = "l", ylab = "OOB",
     xlab = "mtry")
abline(v = which(OOB_NRMSE == min(OOB_NRMSE)) + 1, lty = "dashed", col = "orange")
abline(v = which(OOB_NRMSE[1:30] == min(OOB_NRMSE[1:30])) + 1, lty = "dashed", col = "red")
dev.off()

which(OOB_NRMSE == min(OOB_NRMSE)) + 1 # optimal mtry for explanatory is 28

Absolute minimum would be to use 45, but higher mtry means more computational resources. Find the local minimum around 20, not too high compared to abs minimu. thus use mtry = 21.

Tuning ntree paramteter (additional)

default is 100, increasing ntree can decrease the imputation error. source script : https://github.com/eric-allan/BE.RE.analysis/blob/master/EF_select_suitable_functions.R by Caterina Penone

mtry_val <- 22

OOB_NRMSE <- c()
i <- 1
for(ntree in c(100, 110, 120, 130, 180, 200)) {
   imp.matX <- missForest::missForest(dataset, mtry = mtry_val, ntree = ntree)
   OOB_NRMSE[i] <- imp.matX$OOBerror[1]
   i <- i + 1
   print(i)
}
OOB_NRMSE <- data.table(ntree = c(100, 110, 120, 130, 180, 200), OOB = OOB_NRMSE)

write.table(OOB_NRMSE, file = "function_imputation_parameter_tuning_ntree.csv", sep = ";")

rm(i); rm(ntree)

Visualise

OOB_NRMSE <- read.table(file = paste(pathtodata, "/data_assembly/output_data/function_imputation_parameter_tuning_ntree.csv", sep = ""), sep = ";")
OOB_NRMSE <- data.table(OOB_NRMSE)

OOB_NRMSE[which(OOB_NRMSE$OOB == min(OOB_NRMSE$OOB)), ntree] # 130

pdf("function_imputation_parameter_tuning_ntree.pdf")
plot(OOB_NRMSE$ntree, OOB_NRMSE$OOB, type = "l", ylab = "OOB",
     xlab = "ntree")
abline(v = OOB_NRMSE[which(OOB_NRMSE$OOB == min(OOB_NRMSE$OOB)), ntree], lty = "dashed", col = "orange")
dev.off()

ntree = 130 (not so much better, but still)

Imputation

The imputation is repeated 50 times, and the mean of the imputed values is taken.

# 50 imputations
impvals <- list()
imperr <- list()
# based on one-time imputation, take out : "amoA_AOA.2016" ,"amoA_AOB.2016",  "nxrA_NS", "16S_NB"
numcols <- numcols[which(!numcols %in% c("Plotn", "Plot"))]
# recoding Explo as factor for imputation
befun[, Explo := as.factor(Explo)]
dataset <- befun[, !colnames(befun) %in% c("Plotn", "Plot"), with=F]
for(i in 1:50){
  print(i)
  current <- missForest::missForest(dataset, variablewise = T, mtry = 21, ntree = 130)
  imperr[[i]] <- data.table::data.table("columns" = colnames(dataset), 
                                        "errors" = current$OOBerror)
  current <- data.table::as.data.table(current$ximp)
  # re-transform the numeric variables
  current[, (numcols) := lapply(.SD, function(x) (exp(x)-1)), .SDcols = numcols]
  # re-transform the negative values
  current$DEA.inverted <- current$DEA.inverted - abs(mindea)
  current$NO3.2014 <- current$NO3.2014 - abs(minno)
  current$dung.removal <- current$dung.removal - abs(mindung)
  # add back the "Plotn" column which was taken out for imputation
  current[, Plotn := befun$Plotn]
  # convert imputed data.table to matrix, as more handy for imputed values handling
  impvals[[i]] <- current
}
rm(current); rm(i); rm(mindea); rm(minno); rm(mindung)
# USER : may change filename
saveRDS(impvals, file="raw_imputed_function_values_complete.rds")
saveRDS(imperr, file = "raw_imputed_function_errors_complete.rds")
impvals <- readRDS(paste(pathtodata, "/data_assembly/output_data/raw_imputed_function_values_complete.rds", sep = ""))
imperr <- readRDS(paste(pathtodata, "/data_assembly/output_data/raw_imputed_function_errors_complete.rds", sep = ""))

Check imputation error

imperr <- do.call(rbind, imperr)
mimperr <- data.table::as.data.table(aggregate(errors ~ columns, imperr, mean))
mean(mimperr[,errors]) # 0.06827009
mean(imperr[errors != 0, errors]) # 0.1289546 mean of only the imputed values

# barplot(height=mimperr[,errors], names.arg = mimperr[, columns])
library(cowplot)
p <- ggplot2::ggplot(data=mimperr, ggplot2::aes(x=columns, y=errors)) +
  ggplot2::geom_bar(stat="identity") +
  theme(axis.text.x=element_text(color = "black", size=6, angle=30, vjust=.8, hjust=0.8)) +
  geom_hline(yintercept=0.25, linetype="dashed", color = "gray") +
  ylab("OOB imputation error")
p

The two variables Total pollinators and Parasitoid traps are removed, because the imputation error is too high and because they rely on diversity data (which is hard / not possible to impute). The other variables with imputation errors over 25% are left, because only 1-2 values are missing in those datasets. Also, the variablewise imputation error depends on the variable scale.

dataset[, c("Total.pollinators", "Parasitoid.traps") := NULL]
numcols <- numcols[-which(numcols %in% c("Total.pollinators", "Parasitoid.traps"))]
pdf("imputation_errors.pdf", paper = "a4r")
cowplot::plot_grid(naniar::vis_miss(dataset, sort_miss = T), p, nrow = 2)
dev.off()
#TODO might do nicer plot : not all labels are readable

Get together imputed dataset from 50 imputations.

# backup <- impvals
X <- do.call(rbind, impvals)
X <- data.table::as.data.table(X)
X[, (numcols) := lapply(.SD, as.numeric), .SDcols=numcols]
# backup2 <- data.table::copy(X)

# two ways of aggregating the 50 imputed values together
# Y <- aggregate(X[, ..numcols], list(Plotn=X$Plotn), mean)
Y <- X[, lapply(.SD, mean, na.rm=T), .SDcols=numcols, by=Plotn]
Y <- data.table::data.table(Y)

Compare imputed values with real values. Creates the pdf imputed_values.pdf

pdf("imputed_values.pdf", paper="a4r")
par(mfrow = c(2,7))
for(i in 1:length(numcols)){
  x <- rep(1, length(plotNAset))
  colname <- numcols[i]
  # length of the imputed value vector
  rowlocations <- naloc[, colname]
  y <- Y[rowlocations, get(colname)]
  plot(x, befunbackup[, get(colname)], xlab="", ylab="range of values", main=colname, sub=paste(length(y), "NAs"))
  x <- x[0:length(y)]
  points(x, y, col="red",pch=19)
}
dev.off()

Check correlations after imputation

corrbefun <- Y[, !colnames(Y) %in% c("Plotn", "Explos"), with=F]
M_after <- cor(corrbefun, use="pairwise.complete.obs", )

# 21-12-07_function_imputation_correlations_among_numeric_variables_after_imputation
corrplot::corrplot(M_after, type = "upper", tl.col="black", tl.srt=90, diag = F, tl.cex = 0.5, mar = c(2, 0, 0, 0)) # Potential Supplementary figure XY (compare with before imputation)

# only show high correlations
tres <- 0.7
M_after[which(M_after < tres & M_after > -tres)] <- 0
corrplot::corrplot(M_after, type = "upper", tl.col="black", tl.srt=40, diag = F, tl.cex = 0.3)

# compare correlations before and after
# filter M by columns which are still there in M_after
h <- as.matrix(M_before)
h[!lower.tri(h)] <- 1000; h <- reshape2::melt(h, value.name = "correlation"); h <- data.table(h); h <- h[!correlation == 1000, ]
M_before_dt <- h
setnames(M_before_dt, old = "correlation", new = "correlation_before")
h <- as.matrix(M_after)
h[!lower.tri(h)] <- 1000; h <- reshape2::melt(h, value.name = "correlation"); h <- data.table(h); h <- h[!correlation == 1000, ]
M_after_dt <- h

M_compare <- merge(M_before_dt, M_after_dt, by = c("Var1", "Var2"))
plot(M_compare$correlation_before, M_compare$correlation, 
     xlab = "correlations before imputation", 
     ylab = "correlations after imputation")
mtext(side=3, line=2, at=-0.07, adj=0, cex=0.7, "Difference of correlation before and after \n Mean = Median = 0.00, with sd = 0.01")
summary(M_compare$correlation_before - M_compare$correlation)
# save as "22-05-24_function_imputation_correlations_before_and_after_imputation.pdf"

Store imputed grassland function dataset.

saveRDS(Y, file="imputed_function_values.rds")
fwrite(Y, file="imputed_function_values.csv", sep = ";")
# clean a bit
imputed_grlfuns <- data.table::copy(Y)
rm(befun); rm(befunbackup); rm(corrbefun); rm(dataset); rm(imperr); rm(impvals)
rm(X); rm(Y); rm(p); rm(naloc); rm(mimperr)
rm(numcols); rm(tres)
rm(M_before); rm(M_after); rm(M_before_dt); rm(M_after_dt); rm(M_compare)

The multifunctionalities of imputed functions are calculated in another script. Current name calc_multifun_from_imputed.Rmd.



allanecology/BetaDivMultifun documentation built on Nov. 9, 2023, 8:47 p.m.