knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
suppressMessages({ library(dendextend) library(randomForestSRC) library(SHT) library(mice) library(stringr) library(relate) })
set.seed(123) #load simulated dataset from the package data("cohort_na_df") cohort_na_df[199:202,] #save the true distribution true_distribution <- cohort_na_df$distribution cohort_na_df <- cohort_na_df %>% dplyr::select(-distribution) # examine missing pattern (e.g., some cohorts did not collect specific information) cohort_na_df %>% find_na_level(clusterid = "cohortid") #add random missingness in addition to the systematic missingness for a more realistic scenario random_missing_row <- sample(1:nrow(cohort_na_df), floor(nrow(cohort_na_df)/2)) missing_col <- c(1,8,13,14) cohort_na_df[random_missing_row, missing_col] <- NA # examine missing pattern after adding random missingness cohort_na_df %>% find_na_level(clusterid = "cohortid")
analytic.rfsrc <- rfsrc.fast(cohortid ~ ., #regress cohort id on all other variables data = cohort_na_df, distance=T, ntree=1500, nsplit=3, na.action="na.impute",nimpute=1) # impute within nodes distance.matrix <- analytic.rfsrc$distance # extract distance matrix
#calculate average distance matrix dist_avg_mat <- avg_dist_func(distance.matrix, cohort_na_df$cohortid) #cluster average distance matrix and #identify which cohorts are clustering together on dendogram dmean <- as.dist(dist_avg_mat) #hierarchical clustering hc2 <- hclust(dmean, method = "ward.D2") dend <- as.dendrogram(hc2) plot(dend) table(true_distribution, cohort_na_df$cohortid, dnn = c("True Cluster","Cohort ID"))
We generally do not impute the missing variables that are completely missing or missing more than 50%.
The relate::mice_group_impute
in this package provides imputation by group regardless of missingness level (imputeLevel = 1).
Please find miceadds::mice.impute.bygroup
for similar functionality.
mice_group_impute(df = cohort_na_df, clusterid = "cohortid", imputeLevel = 1, miceArg = list(method = 'mean', maxit = 1))
# we use mean imputation and asymptotic BG test for illustrative purposes. ll <- recursive.test(dend = dend, df = cohort_na_df, cohortid.var = "cohortid", verbose = T, saveIntermediate = F, BG.method = 'permutation', n_perm = 100, impute = T, miceArgs = list(method = c("mean"), m =1, maxit = 5, seed = 123, printFlag = F)) ll$clusters table(true_distribution, cohort_na_df$cohortid, dnn = c("True Cluster","Cohort ID"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.