# imputeTraits <- function(dataset, phylogeny, correlationsTraitsResults, varianceResults = NULL, imputationVariables, orderCriterium = NULL, numberOfPhyloCoordinates = 5,
# predictors = NULL, prodNAs = 0, IterationsNumber = 10, clustersNumber = 2, forceRun = T, parallelization = T){
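# # Arguments (as used below): 'dataset' needs a 'taxon' column plus the trait columns;
# # 'phylogeny' is an ape 'phylo' object whose tip labels match dataset$taxon;
# # 'correlationsTraitsResults' holds pairwise trait results with 'Trait_1', 'Trait_2' and
# # the column named by 'orderCriterium'; 'varianceResults' holds per-trait results with
# # 'Trait' and 'Total_phylogenetic_conservatism'; 'imputationVariables' and 'predictors'
# # are column names of 'dataset'; 'numberOfPhyloCoordinates' sets how many phylogenetic
# # eigenvectors are used as predictors; 'prodNAs' is the proportion of artificial NAs used
# # to assess predictive performance; 'IterationsNumber', 'clustersNumber', 'forceRun' and
# # 'parallelization' control repetition, parallel workers, cache refresh and parallel
# # imputation, respectively.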
#
# ### Objects to gather results ####
#
# predictivePerformance.all <- data.frame()
# ximp.all <- data.frame()
#
# predictivePerformance.all2 <- data.frame()
# ximp.all2 <- data.frame()
#
# imputationResults <- list()
#
# modelName <- paste("(", paste(imputationVariables, collapse = " <-> "), ") <- ", paste(predictors, collapse = ", "), paste0("numberPhyloCoord_", numberOfPhyloCoordinates))
#
# #### PHYLOGENETIC PRINCIPAL COORDINATES ------------------------------------------ ####
#
# ### Phylogenetic principal coordinates ####
#
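# # Pairwise phylogenetic distances are decomposed with a PCoA and the first 50
# # eigenvectors are merged into the trait data; the result is cached as
# # "phylo_eigenvectors.csv" in 'outputs.dir', which must be defined in the calling
# # environment (it is not a function argument)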
# if (!file.exists(paste0(outputs.dir, "phylo_eigenvectors.csv")) || isTRUE(forceRun)) {
# mat <- ape::cophenetic.phylo(phylogeny)
# mat <- as.data.frame(mat)
# pCordA <- ape::pcoa(mat)
# phyloEigenV <- as.data.frame(pCordA$vectors)
# phyloEigenV <- phyloEigenV[dataset$taxon, c(1:50)]
# phyloEigenV$taxon <- rownames(phyloEigenV)
# # left_join preserves the row order of 'dataset', keeping taxa aligned with dataset$taxon
# imp.dataset <- dplyr::left_join(dataset, phyloEigenV, by = "taxon")
# names(imp.dataset) <- stringr::str_replace_all(names(imp.dataset), "Axis.", "Phylo_axis_")
# utils::write.csv(imp.dataset, paste0(outputs.dir, "phylo_eigenvectors.csv"),
# row.names = F)
# print(paste0(outputs.dir, "phylo_eigenvectors.csv"))
# } else {
# imp.dataset <- utils::read.csv(paste0(outputs.dir, "phylo_eigenvectors.csv"),
# header = T)
# print("loading previously calculated phylogenetic eigenvectors")
# }
#
# ### Imputation order ####
#
# # Order traits according to the covariance structure
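# # Trait pairs are ranked by the absolute value of the 'orderCriterium' column and the
# # imputation sequence is built by walking the ranked pairs, adding each trait the first
# # time it appears; without a criterium the order is randomised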
#
# if(!is.null(orderCriterium)){
# correlationsTraitsResults$order <- abs(correlationsTraitsResults[, orderCriterium])
# phylo_order <- correlationsTraitsResults %>%
# dplyr::arrange(dplyr::desc(order)) %>%
# dplyr::select(Trait_1, Trait_2, dplyr::all_of(orderCriterium))
#
# imputation.variables <- character()
# for (i in seq_len(nrow(phylo_order))) {
# impVars <- phylo_order[i, ] %>% dplyr::select(Trait_1, Trait_2)
# impVars <- c(impVars$Trait_1, impVars$Trait_2)
# impVars <- impVars[!impVars %in% imputation.variables]
# imputation.variables <- c(imputation.variables, impVars)
# }
# # Order the first two traits according to their phylogenetic variance
# if(!is.null(varianceResults)){
# firstTwoOrder <- varianceResults %>%
# dplyr::filter(Trait %in% imputation.variables[1:2]) %>%
# dplyr::arrange(dplyr::desc(abs(Total_phylogenetic_conservatism))) %>%
# dplyr::pull(Trait)
#
# imputation.variables[1:2] <- firstTwoOrder
# }
# } else {
# imputation.variables <- sample(imputationVariables)
# }
#
# imputationVariables <- imputation.variables[imputation.variables %in% imputationVariables]
#
# # Imputation dataset
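# # The first 'numberOfPhyloCoordinates' phylogenetic axes are appended to the predictors,
# # and 'xTrue' keeps the original values before artificial NAs are introduced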
#
# predictors <- c(predictors, c(paste0("Phylo_axis_", 1:numberOfPhyloCoordinates)))
#
# imp.dataset <- imp.dataset %>% dplyr::select(c(imputationVariables, predictors))
# xTrue <- imp.dataset
#
# ### IMPUTATION 1. Run models to impute variables following the previously determined order ####
#
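# # Each iteration introduces a proportion 'prodNAs' of artificial NAs into the imputation
# # variables, imputes them with random forests (in parallel across 'clustersNumber'
# # workers), and, when artificial NAs were produced, scores NRMSE and R2 against xTrue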
# for (n in 1:IterationsNumber) {
#
# imp.dataset[, imputationVariables] <- missForest::prodNA(as.data.frame(xTrue[, imputationVariables]), prodNAs)
# cl <- parallel::makeCluster(clustersNumber)
# doParallel::registerDoParallel(cl)
# if (prodNAs != 0) {
# rfImp.res <- randomForestImpute(xmis = as.matrix(imp.dataset),
# maxiter = 50, ntree = 100, parallelize = parallelization,
# xtrue = as.matrix(xTrue))
#
# # R2 calculation
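# # (squared Pearson correlation between the original values and the imputed values at
# # the cells that were observed in xTrue but set to NA artificially)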
# r2.var <- numeric()
# for(impVariable in imputationVariables){
# matObs <- as.matrix(xTrue[, impVariable])
# matNA <- as.matrix(imp.dataset[, impVariable])
#
# observed <- matObs[which(!is.na(matObs) & is.na(matNA))]
#
# predicted <- as.matrix(rfImp.res$ximp[, impVariable])
# predicted <- predicted[which(!is.na(matObs) & is.na(matNA))]
#
# r2.var[impVariable] <- cor(observed, predicted)^2
# }
#
# predictivePerformance <- data.frame(Variable = imputationVariables,
# N = nrow(imp.dataset),
# N_Obs = colSums(!is.na(imp.dataset[, imputationVariables, drop = FALSE])),
# N_NA = colSums(is.na(imp.dataset[, imputationVariables, drop = FALSE])),
# NRMSE = rfImp.res$OOBerror[1:length(imputationVariables)],
# R2 = r2.var,
# Model = modelName)
# } else {
# rfImp.res <- randomForestImpute(xmis = as.matrix(imp.dataset),
# maxiter = 50, ntree = 1000, parallelize = parallelization)
#
# predictivePerformance <- data.frame(Variable = imputationVariables,
# N = nrow(imp.dataset),
# N_Obs = colSums(!is.na(imp.dataset[, imputationVariables, drop = FALSE])),
# N_NA = colSums(is.na(imp.dataset[, imputationVariables, drop = FALSE])),
# NRMSE = rfImp.res$OOBerror[1:length(imputationVariables)],
# Model = modelName)
# }
# parallel::stopCluster(cl)
#
# row.names(predictivePerformance) <- NULL
# print(predictivePerformance)
#
# ## Results of the first round of imputation (gap filling) per iteration
#
# ximp <- as.data.frame(rfImp.res$ximp)
# ximp$taxon <- dataset$taxon
# predictivePerformance.all <- rbind(predictivePerformance.all, predictivePerformance)
# ximp.all <- rbind(ximp.all, ximp)
# }
#
# ### Results aggregation (for all iterations)
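# # Imputed values and performance metrics are averaged across iterations; with more than
# # one iteration the per-taxon and per-variable standard deviations are also stored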
#
# imputationResults$ximp <- stats::aggregate(ximp.all[, -which(names(ximp) == "taxon")], by = list(ximp.all$taxon), FUN = mean) %>% dplyr::rename(taxon = Group.1)
# imputationResults$predictivePerformance <- stats::aggregate(predictivePerformance.all[, -c(which(names(predictivePerformance.all) == "Variable"), which(names(predictivePerformance.all) == "Model"))],
# by = list(predictivePerformance.all$Variable), FUN = mean) %>%
# dplyr::rename(Variable = Group.1) %>%
# dplyr::mutate(Model = modelName)
#
# if (IterationsNumber > 1) {
# imputationResults$ximp_sd <- stats::aggregate(ximp.all[, -which(names(ximp) == "taxon")], by = list(ximp.all$taxon), FUN = stats::sd) %>%
# dplyr::rename(taxon = Group.1)
#
# imputationResults$predictivePerformance_sd <- stats::aggregate(predictivePerformance.all[, -c(which(names(predictivePerformance.all) == "Variable"), which(names(predictivePerformance.all) == "Model"))],
# by = list(predictivePerformance.all$Variable), FUN = stats::sd) %>% dplyr::rename(Variable = Group.1) %>%
# dplyr::mutate(Model = modelName)
# imputationResults$ximp_all_iterations <- ximp.all
# imputationResults$predictivePerformance_all_iterations <- predictivePerformance.all
# }
#
# ### IMPUTATION 2. Run models to impute variables following the previously determined order using imputed traits (except for the one being imputed) ####
#
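# # Each trait is re-imputed one at a time, using the round-1 imputed values of all other
# # traits (plus the predictors) as covariates; the freshly imputed column replaces the
# # corresponding column of 'imp.dataset2' so later traits benefit from it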
# imp.dataset2 <- xTrue
# ximp2 <- as.data.frame(imputationResults$ximp)
#
# for(impVariable in imputationVariables){
#
# for (n in 1:IterationsNumber) {
# ximp2[, impVariable] <- missForest::prodNA(as.data.frame(imp.dataset2[, impVariable]), prodNAs)
#
# ximp2 <- ximp2 %>% dplyr::select(-taxon)
#
# cl <- parallel::makeCluster(clustersNumber)
# doParallel::registerDoParallel(cl)
# if (prodNAs != 0) {
# rfImp.res2 <- randomForestImpute(xmis = as.matrix(ximp2),
# maxiter = 50, ntree = 1000, parallelize = parallelization,
# xtrue = as.matrix(xTrue))
#
#
# # R2 calculation
# matObs <- as.matrix(xTrue[, impVariable])
# matNA <- as.matrix(ximp2[, impVariable])
#
# observed <- matObs[which(!is.na(matObs) & is.na(matNA))]
#
# predicted <- as.matrix(rfImp.res2$ximp[, impVariable])
# predicted <- predicted[which(!is.na(matObs) & is.na(matNA))]
#
# r2.var <- cor(observed, predicted)^2
#
# predictivePerformance <- data.frame(Variable = impVariable,
# N = length(ximp2[, 1]),
# N_Obs = length(ximp2[which(!is.na(ximp2[, impVariable])), 1]),
# N_NA = length(ximp2[which(is.na(ximp2[, impVariable])), 1]),
# NRMSE = rfImp.res2$OOBerror[1:length(impVariable)],
# R2 = r2.var,
# Model = modelName)
# } else {
# rfImp.res2 <- randomForestImpute(xmis = as.matrix(ximp2),
# maxiter = 50, ntree = 1000, parallelize = parallelization)
#
# predictivePerformance <- data.frame(Variable = impVariable,
# N = nrow(ximp2),
# N_Obs = sum(!is.na(ximp2[, impVariable])),
# N_NA = sum(is.na(ximp2[, impVariable])),
# NRMSE = rfImp.res2$OOBerror[1],
# Model = modelName)
# }
# parallel::stopCluster(cl)
#
# ## Results of the second round of imputation (gap filling) per iteration
#
# row.names(predictivePerformance) <- NULL
# print(predictivePerformance)
# ximp2[, impVariable] <- as.data.frame(rfImp.res2$ximp[, impVariable])
# ximp2$taxon <- dataset$taxon
# predictivePerformance.all2 <- rbind(predictivePerformance.all2, predictivePerformance)
# ximp.all2 <- rbind(ximp.all2, ximp2)
#
# imp.dataset2[, impVariable] <- as.data.frame(rfImp.res2$ximp[, impVariable])
# }
# }
#
# ### Results aggregation (for all iterations)
#
# imputationResults$ximp2 <- stats::aggregate(ximp.all2[, -which(names(ximp.all2) == "taxon")], by = list(ximp.all2$taxon), FUN = mean) %>% dplyr::rename(taxon = Group.1)
# imputationResults$predictivePerformance2 <- stats::aggregate(predictivePerformance.all2[, -c(which(names(predictivePerformance.all2) == "Variable"), which(names(predictivePerformance.all2) == "Model"))],
# by = list(predictivePerformance.all2$Variable), FUN = mean) %>%
# dplyr::rename(Variable = Group.1) %>%
# dplyr::mutate(Model = modelName)
#
# if (IterationsNumber > 1) {
# imputationResults$ximp_sd2 <- stats::aggregate(ximp.all2[, -which(names(ximp.all2) == "taxon")], by = list(ximp.all2$taxon), FUN = stats::sd) %>%
# dplyr::rename(taxon = Group.1)
#
# imputationResults$predictivePerformance_sd2 <- stats::aggregate(predictivePerformance.all2[, -c(which(names(predictivePerformance.all2) == "Variable"), which(names(predictivePerformance.all2) == "Model"))],
# by = list(predictivePerformance.all2$Variable), FUN = stats::sd) %>% dplyr::rename(Variable = Group.1) %>%
# dplyr::mutate(Model = modelName)
# imputationResults$ximp_all_iterations2 <- ximp.all2
# imputationResults$predictivePerformance_all_iterations2 <- predictivePerformance.all2
# }
#
# return(imputationResults)
# }
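#
# ## Example usage: a minimal sketch. The objects 'traits', 'tree', 'trait_correlations'
# ## and 'trait_variances' are hypothetical and follow the structure described above, the
# ## trait names are placeholders, and "Pearson_correlation" stands in for whichever column
# ## of 'trait_correlations' is used as the ordering criterium; 'outputs.dir' must be
# ## defined before the call.
# outputs.dir <- "outputs/"
# imputed <- imputeTraits(dataset = traits, phylogeny = tree,
# correlationsTraitsResults = trait_correlations, varianceResults = trait_variances,
# imputationVariables = c("body_mass", "litter_size"),
# orderCriterium = "Pearson_correlation", numberOfPhyloCoordinates = 5,
# prodNAs = 0.1, IterationsNumber = 10, clustersNumber = 2)
# imputed$ximp                  # mean imputed values across iterations
# imputed$predictivePerformance # NRMSE (and R2 when prodNAs > 0) per variable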