#TODO : tune the mtry parameter (probably around 20 would be good) #TODO : calculate the assembled functions AFTER imputation #TODO : recheck negative values #TODO : note that the variable-wise error depends on the scale --> keep in mind # note : impute response and explanatory separately
This script :
- uses raw functions from the synthesis dataset (loaded with analysis_nonpublic.R
, variable raw_grlfuns
)
- note if taken from bexis, first run the reformatting script bexis_to_wide_format.R
attached to the dataset.
- The grassland functions data used for gapfilling is stored in raw_grlfuns
, selected variables according to info data. raw_grlfuns
is loaded by analysis_nonpublic.R
. Make sure you also run the selection of functions for gapfilling!
- performs gap-filling
- saves the gap-filled dataset
- required input : info_data
, raw_grlfuns
, plotNAset
(the plot set defined by diversity data in calc_betadiversities.Rmd
with the final set of included plots for analysis)*
- output : generates gap-filled dataset impdat
, and imp_err
contianing the imputation error. Two variables are produced, impdat
which contains the gapfilled dataset which will be used in further analysis as well as imp_err
, which contains the imputation error automatically estimated by missForest
. The imputed dataset is stored as .rds file.
sections_to_be_loaded <- c("assemble_functions") source("vignettes/analysis_nonpublic.R")
The dataset used for gap-filling is larger than the dataset used for the analysis. More variables enhance the performance of the gap-filling. Note : Only use additional variables which are not used in the analysis. Otherwise, data can be circular. (This means that e.g. no soil data is used for imputation, but additional functions which are not included in the later analysis).
befun <- data.table::copy(raw_grlfuns)
chosing the plot set
befun <- befun[Plotn %in% plotNAset]
cleaning out variables with too many missing values
visdat::vis_miss(befun, sort_miss = T)
Values with over 21% missing data could be excluded here (mAMF hyphae, Parasitoid traps, Aggregation, PRI.2011 and NaHCO3.Pi).
Here, values with too many missings are not taken out yet. Instead, the imputation is done and the variables are only removed in case their imputation error is too high.
#USER : set treshold for missing values treshold <- 0.21 t <- apply(befun, 2, function(x) sum(is.na(x))) exclude <- names(which(t > 150 * treshold)) befun <- befun[, !colnames(befun) %in% exclude, with=F] rm(t); rm(exclude); rm(treshold)
If the missing plot from 16S, amoA_AOA.2016 & Co is taken out, 3 more functions can be included in the functions set.
# befun[is.na(`16S_NB`), Plotn] # befun[is.na(amoA_AOA.2016), Plotn] # befun[is.na(amoA_AOB.2016), Plotn] # visdat::vis_miss(befun[, .(`16S_NB`, amoA_AOA.2016, amoA_AOB.2016, amoA_AOA.2011)], sort_miss = T) # The missing plot is "HEG31" befun <- befun[Plotn != "HEG31"]
update the helper vector plotNAset.rds
and take out the plot "HEG31"
# readRDS("plotNAset.rds") plotNAset <- plotNAset[-which(plotNAset == "HEG31")] saveRDS(plotNAset, "plotNAset.rds")
Of the non-factor columns only
corrbefun <- befun[, !colnames(befun) %in% c("Explo", "Plotn", "Plot"), with=F] M <- cor(corrbefun, use="pairwise.complete.obs") M_before <- M # save for later comparison with M_after # raw_grlfun_corrplot1 corrplot::corrplot(M,type="lower",addCoef.col = "black",method="color",diag=F, tl.srt=1, tl.col="black", mar=c(0,0,0,0), number.cex=0.25, tl.cex = 0.3) # raw_grlfun_corrplot2 corrplot::corrplot(M, type = "upper", tl.col="black", tl.srt=90, diag = F, tl.cex = 0.5, mar = c(2, 0, 0, 0)) # Supplementary figure XY # only show high correlations tres <- 0.7 M[which(M < tres & M > -tres)] <- 0 corrplot::corrplot(M, type = "upper", tl.col="black", tl.srt=40, diag = F, tl.cex = 0.3)
Network
# show network - is there a variable "alone"? bzw. isolated? network <- igraph::graph_from_adjacency_matrix(M, weighted=T, mode="undirected", diag=F) plot(network)
There is no variable which is not connected with at least one line to another variable. Lines represent correlations > threshold.
rm(M); rm(network)
before changing values, store the original dataset
befunbackup <- data.table::copy(befun)
remove negative values, remember to add again after imputation!
find negative values
par(mar = c(10, 1, 3, 1)) boxplot(corrbefun, las=2) # minimum of the numeric columns negative <- colnames(corrbefun)[which(corrbefun[, lapply(.SD, function(x) min(x, na.rm=T))] < 0)]
remove negative values - shift whole vector up.
# DEA.inverted mindea <- min(befun$DEA.inverted, na.rm=T) befun$DEA.inverted <- befun$DEA.inverted + abs(mindea) # NO3.2014 minno <- min(befun$NO3.2014, na.rm=T) befun$NO3.2014 <- befun$NO3.2014 + abs(minno) # dung.removal mindung <- min(befun$dung.removal, na.rm=T) befun$dung.removal <- befun$dung.removal + abs(mindung) rm(negative)
Log transformation of the numeric columns. The natural logartithm of values smaller than 1 is negative. Therefore, all values are shifted by 1 again to avoid any negative values in the log transformed dataset.
numcols <- colnames(befun)[!colnames(befun) %in% c("Explo", "Plotn", "Plot")] befun[, (numcols) := lapply(.SD, as.numeric), .SDcols = numcols] # log transform the values + 1, later shift them back. befun[, (numcols) := lapply(.SD, function(x) log(x+1)), .SDcols = numcols] # record NA value positions for visualistation naloc <- is.na(befun)
# recoding Explo as factor for imputation befun[, Explo := as.factor(Explo)] dataset <- befun[, !colnames(befun) %in% c("Plotn", "Plot"), with=F]
By default square root of p = number of columns. Try values between 2 and 50 to tune the OOB error. Find the minimum of error and use the according mtry parameter value. Around 20 worked well in previous projects.
source script : https://github.com/eric-allan/BE.RE.analysis/blob/master/EF_select_suitable_functions.R by Caterina Penone
# note : only run once, then work with the output dataset. maxiteration <- 50 sqrt(ncol(befun)) OOB_NRMSE <- c() i <- 1 for (mtry in 2:maxiteration) { imp.matX <- missForest::missForest(dataset, mtry = mtry) OOB_NRMSE[i] <- imp.matX$OOBerror[1] i <- i + 1 print(i) } rm(i) write.table(OOB_NRMSE, file = "function_imputation_parameter_tuning_mtry.csv", sep = ";")
OOB_NRMSE <- read.table(file = paste(pathtodata, "/data_assembly/output_data/function_imputation_parameter_tuning_mtry.csv", sep = ""), sep = ";") OOB_NRMSE <- OOB_NRMSE$x pdf(file = "function_imputation_parameter_tuning_mtry.pdf", paper = "a4r") plot(seq(2, maxiteration), OOB_NRMSE, type = "l", ylab = "OOB", xlab = "mtry") abline(v = which(OOB_NRMSE == min(OOB_NRMSE)) + 1, lty = "dashed", col = "orange") abline(v = which(OOB_NRMSE[1:30] == min(OOB_NRMSE[1:30])) + 1, lty = "dashed", col = "red") dev.off() which(OOB_NRMSE == min(OOB_NRMSE)) + 1 # optimal mtry for explanatory is 28
Absolute minimum would be to use 45, but higher mtry means more computational resources. Find the local minimum around 20, not too high compared to abs minimu. thus use mtry = 21.
default is 100, increasing ntree can decrease the imputation error. source script : https://github.com/eric-allan/BE.RE.analysis/blob/master/EF_select_suitable_functions.R by Caterina Penone
mtry_val <- 22 OOB_NRMSE <- c() i <- 1 for(ntree in c(100, 110, 120, 130, 180, 200)) { imp.matX <- missForest::missForest(dataset, mtry = mtry_val, ntree = ntree) OOB_NRMSE[i] <- imp.matX$OOBerror[1] i <- i + 1 print(i) } OOB_NRMSE <- data.table(ntree = c(100, 110, 120, 130, 180, 200), OOB = OOB_NRMSE) write.table(OOB_NRMSE, file = "function_imputation_parameter_tuning_ntree.csv", sep = ";") rm(i); rm(ntree)
Visualise
OOB_NRMSE <- read.table(file = paste(pathtodata, "/data_assembly/output_data/function_imputation_parameter_tuning_ntree.csv", sep = ""), sep = ";") OOB_NRMSE <- data.table(OOB_NRMSE) OOB_NRMSE[which(OOB_NRMSE$OOB == min(OOB_NRMSE$OOB)), ntree] # 130 pdf("function_imputation_parameter_tuning_ntree.pdf") plot(OOB_NRMSE$ntree, OOB_NRMSE$OOB, type = "l", ylab = "OOB", xlab = "ntree") abline(v = OOB_NRMSE[which(OOB_NRMSE$OOB == min(OOB_NRMSE$OOB)), ntree], lty = "dashed", col = "orange") dev.off()
ntree = 130 (not so much better, but still)
The imputation is repeated 50 times, and the mean of the imputed values is taken.
# 50 imputations impvals <- list() imperr <- list() # based on one-time imputation, take out : "amoA_AOA.2016" ,"amoA_AOB.2016", "nxrA_NS", "16S_NB" numcols <- numcols[which(!numcols %in% c("Plotn", "Plot"))] # recoding Explo as factor for imputation befun[, Explo := as.factor(Explo)] dataset <- befun[, !colnames(befun) %in% c("Plotn", "Plot"), with=F]
for(i in 1:50){ print(i) current <- missForest::missForest(dataset, variablewise = T, mtry = 21, ntree = 130) imperr[[i]] <- data.table::data.table("columns" = colnames(dataset), "errors" = current$OOBerror) current <- data.table::as.data.table(current$ximp) # re-transform the numeric variables current[, (numcols) := lapply(.SD, function(x) (exp(x)-1)), .SDcols = numcols] # re-transform the negative values current$DEA.inverted <- current$DEA.inverted - abs(mindea) current$NO3.2014 <- current$NO3.2014 - abs(minno) current$dung.removal <- current$dung.removal - abs(mindung) # add back the "Plotn" column which was taken out for imputation current[, Plotn := befun$Plotn] # convert imputed data.table to matrix, as more handy for imputed values handling impvals[[i]] <- current } rm(current); rm(i); rm(mindea); rm(minno); rm(mindung) # USER : may change filename saveRDS(impvals, file="raw_imputed_function_values_complete.rds") saveRDS(imperr, file = "raw_imputed_function_errors_complete.rds")
impvals <- readRDS(paste(pathtodata, "/data_assembly/output_data/raw_imputed_function_values_complete.rds", sep = "")) imperr <- readRDS(paste(pathtodata, "/data_assembly/output_data/raw_imputed_function_errors_complete.rds", sep = ""))
Check imputation error
imperr <- do.call(rbind, imperr) mimperr <- data.table::as.data.table(aggregate(errors ~ columns, imperr, mean)) mean(mimperr[,errors]) # 0.06827009 mean(imperr[errors != 0, errors]) # 0.1289546 mean of only the imputed values # barplot(height=mimperr[,errors], names.arg = mimperr[, columns]) library(cowplot) p <- ggplot2::ggplot(data=mimperr, ggplot2::aes(x=columns, y=errors)) + ggplot2::geom_bar(stat="identity") + theme(axis.text.x=element_text(color = "black", size=6, angle=30, vjust=.8, hjust=0.8)) + geom_hline(yintercept=0.25, linetype="dashed", color = "gray") + ylab("OOB imputation error") p
The two variables Total pollinators and Parasitoid traps are removed, because the imputation error is too high and because they rely on diversity data (which is hard / not possible to impute). The other variables with imputation errors over 25% are left, because only 1-2 values are missing in those datasets. Also, the variablewise imputation error depends on the variable scale.
dataset[, c("Total.pollinators", "Parasitoid.traps") := NULL] numcols <- numcols[-which(numcols %in% c("Total.pollinators", "Parasitoid.traps"))]
pdf("imputation_errors.pdf", paper = "a4r") cowplot::plot_grid(naniar::vis_miss(dataset, sort_miss = T), p, nrow = 2) dev.off() #TODO might do nicer plot : not all labels are readable
Get together imputed dataset from 50 imputations.
# backup <- impvals X <- do.call(rbind, impvals) X <- data.table::as.data.table(X) X[, (numcols) := lapply(.SD, as.numeric), .SDcols=numcols] # backup2 <- data.table::copy(X) # two ways of aggregating the 50 imputed values together # Y <- aggregate(X[, ..numcols], list(Plotn=X$Plotn), mean) Y <- X[, lapply(.SD, mean, na.rm=T), .SDcols=numcols, by=Plotn] Y <- data.table::data.table(Y)
Compare imputed values with real values. Creates the pdf imputed_values.pdf
pdf("imputed_values.pdf", paper="a4r") par(mfrow = c(2,7)) for(i in 1:length(numcols)){ x <- rep(1, length(plotNAset)) colname <- numcols[i] # length of the imputed value vector rowlocations <- naloc[, colname] y <- Y[rowlocations, get(colname)] plot(x, befunbackup[, get(colname)], xlab="", ylab="range of values", main=colname, sub=paste(length(y), "NAs")) x <- x[0:length(y)] points(x, y, col="red",pch=19) } dev.off()
Check correlations after imputation
corrbefun <- Y[, !colnames(Y) %in% c("Plotn", "Explos"), with=F] M_after <- cor(corrbefun, use="pairwise.complete.obs", ) # 21-12-07_function_imputation_correlations_among_numeric_variables_after_imputation corrplot::corrplot(M_after, type = "upper", tl.col="black", tl.srt=90, diag = F, tl.cex = 0.5, mar = c(2, 0, 0, 0)) # Potential Supplementary figure XY (compare with before imputation) # only show high correlations tres <- 0.7 M_after[which(M_after < tres & M_after > -tres)] <- 0 corrplot::corrplot(M_after, type = "upper", tl.col="black", tl.srt=40, diag = F, tl.cex = 0.3) # compare correlations before and after # filter M by columns which are still there in M_after h <- as.matrix(M_before) h[!lower.tri(h)] <- 1000; h <- reshape2::melt(h, value.name = "correlation"); h <- data.table(h); h <- h[!correlation == 1000, ] M_before_dt <- h setnames(M_before_dt, old = "correlation", new = "correlation_before") h <- as.matrix(M_after) h[!lower.tri(h)] <- 1000; h <- reshape2::melt(h, value.name = "correlation"); h <- data.table(h); h <- h[!correlation == 1000, ] M_after_dt <- h M_compare <- merge(M_before_dt, M_after_dt, by = c("Var1", "Var2")) plot(M_compare$correlation_before, M_compare$correlation, xlab = "correlations before imputation", ylab = "correlations after imputation") mtext(side=3, line=2, at=-0.07, adj=0, cex=0.7, "Difference of correlation before and after \n Mean = Median = 0.00, with sd = 0.01") summary(M_compare$correlation_before - M_compare$correlation) # save as "22-05-24_function_imputation_correlations_before_and_after_imputation.pdf"
Store imputed grassland function dataset.
saveRDS(Y, file="imputed_function_values.rds") fwrite(Y, file="imputed_function_values.csv", sep = ";") # clean a bit imputed_grlfuns <- data.table::copy(Y) rm(befun); rm(befunbackup); rm(corrbefun); rm(dataset); rm(imperr); rm(impvals) rm(X); rm(Y); rm(p); rm(naloc); rm(mimperr) rm(numcols); rm(tres) rm(M_before); rm(M_after); rm(M_before_dt); rm(M_after_dt); rm(M_compare)
The multifunctionalities of imputed functions are calculated in another script. Current name calc_multifun_from_imputed.Rmd
.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.