clustra_bp_vignette.R
In clustra: Clustering Longitudinal Trajectories

## ----setup--------------------------------------------------------------------
knitr::opts_knit$set(
  collapse = TRUE,
  comment = "#>"
)
cran = list(LibsData = TRUE, plot_true = TRUE, plot_raw = TRUE, run5 = TRUE, run5_v2 = FALSE, iter_plots = TRUE, plot5_side_by_side = FALSE, fig1 = FALSE, comp_true = FALSE, run10 = FALSE, plot10 = FALSE, run2 = FALSE, rand2 = FALSE, silplot = FALSE, silplot2 = FALSE, rand_plot = FALSE, hclust40 = FALSE, SAS_R_data = FALSE, Rand_SAS_R = FALSE, true_group = FALSE)
figs = list(LibsData = TRUE, plot_true = FALSE, plot_raw = FALSE, run5 = TRUE, run5_v2 = TRUE, iter_plots = FALSE, plot5_side_by_side = FALSE, fig1 = TRUE, comp_true = TRUE, run10 = TRUE, plot10 = TRUE, run2 = TRUE, rand2 = TRUE, silplot = TRUE, silplot2 = FALSE, rand_plot = TRUE, hclust40 = FALSE, SAS_R_data = TRUE, Rand_SAS_R = TRUE, true_group = TRUE)
all = list(LibsData = TRUE, plot_true = TRUE, plot_raw = TRUE, run5 = TRUE, run5_v2 = TRUE, iter_plots = TRUE, plot5_side_by_side = TRUE, fig1 = TRUE, comp_true = TRUE, run10 = TRUE, plot10 = TRUE, run2 = TRUE, rand2 = TRUE, silplot = TRUE, silplot2 = TRUE, rand_plot = TRUE, hclust40 = TRUE, SAS_R_data = TRUE, Rand_SAS_R = TRUE, true_group = TRUE)
chunk = cran  # chose which version of the vignette to run
full_data = FALSE # choose if full 80,000 id data or sample of 10,000 ids
mc = 2
# If running on a Unix or a Mac platform, set to number of available cores.
# This vignette will experience notable speedup up to about 10 cores.
# On Intel chips with hyperthreading, up to 2x available cores can be used.
if (.Platform$OS.type == "windows") mc = 1

data.table::setDTthreads(1) # manage data.table threads (increase with full_data)

## ----LibsData, warning=FALSE, eval = chunk$LibsData---------------------------
library(clustra)
start_time = deltime()
print(paste("Number of cores being used: mc parameter =", mc))

repo = "https://github.com/MVP-CHAMPION/"

repo_sas = paste0(repo, "clustra-SAS/raw/main/")
if(full_data) {
  url = paste0(repo_sas, "bp_data/simulated_data_27June2023.csv.gz")
  data = data.table::fread(url)
} else {
  data = bp10k
}
data$true_group = data$group
head(data)
plot_path = paste0(tempdir(), "/") # output dir for plots
cat("Resulting plots are saved in", plot_path, "directory\n")

## ----plot_true, fig.width = 7, fig.height = 9, eval = chunk$plot_true---------
set.seed(12345)
plot_sample(data, layout = c(3, 3), group = "true_group")

## ----plot_raw, fig.width = 7, fig.height = 7, eval = chunk$plot_raw-----------
plot_smooths(data, fits = NULL, max.data = 20000, group = "true_group")

## ----run5, eval = chunk$run5--------------------------------------------------
set.seed(12345)
cl5 = clustra(data, k = 5, maxdf = 30, conv = c(20, 0.5), mccores = mc, verbose = TRUE)

## ----run5_v2, eval = chunk$run5_v2--------------------------------------------
#  # What happens if we cut the iterations at 5 maximum?
#  set.seed(12345)
#  cl5.v2 = clustra(data, k = 5, maxdf = 30, conv = c(5, 0.5), mccores = mc)

## ----iter_plots, fig.width=7, fig.height=7, eval = chunk$iter_plots-----------
set.seed(12345)
clustra(data, k = 5, maxdf = 30, conv = c(10, 0.5), mccores = mc, verbose = 2, ylim = c(110, 170))

## ----plot5_side_by_side, fig.width = 7, fig.height = 5, warnings=FALSE, eval = chunk$plot5_side_by_side----
#  plot_smooths(data, fits = cl5$tps, max.data = 30000)
#  m = matrix(c(0.1, .95, 0.1, .95), nrow = 1)
#  sc = split.screen(m)
#  screen(1)
#    sc = split.screen(c(1, 2))
#    screen(2)
#      plot_smooths(data, fits = cl5$tps, max.data = 0, ylim = c(110, 180))
#    screen(3)
#      plot_smooths(data, fits = cl5.v2$tps, max.data = 0, ylim = c(110, 180))
#  close.screen(all.screens = TRUE)

## ----fig1, fig.width = 7, fig.height = 5, eval = chunk$fig1-------------------
#  png(paste0(plot_path, "R_cl5_plot.png"), width = 480, height = 360)
#      plot_smooths(data, fits = cl5$tps, max.data = 0, ylim = c(110, 180))
#  dev.off()
#  knitr::include_graphics(paste0(plot_path, "R_cl5_plot.png"))

## ----comp_true, eval = chunk$comp_true----------------------------------------
#  MixSim::RandIndex(cl5$data_group, data[, true_group])$AR
#  MixSim::RandIndex(cl5.v2$data_group, data[, true_group])$AR
#  MixSim::RandIndex(cl5$data_group, cl5.v2$data_group)$AR

## ----run10, eval = chunk$run10------------------------------------------------
#  set.seed(12345)
#  t = deltime()
#  cl10 = clustra(data, k = 10, maxdf = 30, conv = c(50, 0.5), mccores = mc)
#  deltimeT(t, paste("run time on", mc, "cores "))

## ----plot10, fig.width = 7, fig.height = 5, eval = chunk$plot10---------------
#  png(paste0(plot_path, "R_cl10_plot.png"), width = 480, height = 360)
#    plot_smooths(data, cl10$tps, max.data = 0, ylim = c(110, 180))
#  dev.off()
#  knitr::include_graphics(paste0(plot_path, "R_cl10_plot.png"))
#  MixSim::RandIndex(cl10$data_group, data[, true_group])$AR

## ----run2, fig.width = 7, fig.height = 5, eval = chunk$run2-------------------
#  set.seed(12345)
#  t = deltime()
#  cl2 = clustra(data, k = 2, maxdf = 30, conv = c(20,0.5), mccores=mc)
#  deltimeT(t, paste(" run time on", mc, "cores "))
#  cat(paste("Number of iterations completed:",cl2$iterations))
#  cat(paste("Number of individuals changing groups in last iteration:",cl2$changes))
#  png(paste0(plot_path, "R_cl2_plot.png"), width = 480, height = 360)
#    plot_smooths(data, cl2$tps, max.data = 0, ylim = c(110, 180))
#  dev.off()
#  knitr::include_graphics(paste0(plot_path, "R_cl2_plot.png"))
#  MixSim::RandIndex(cl2$data_group, data[, true_group])$AR

## ----rand2, comp.within, eval = chunk$rand2-----------------------------------
#  MixSim::RandIndex(cl5$data_group,cl2$data_group)$AR
#  MixSim::RandIndex(cl5$data_group,cl10$data_group)$AR
#  MixSim::RandIndex(cl10$data_group,cl2$data_group)$AR

## ----silplot, fig.width = 7, fig.height = 5, eval = chunk$silplot-------------
#  t = deltime()
#  
#  sil = clustra_sil(cl5, mccores = mc, conv=c(20,0.5))
#  png(paste0(plot_path, "R_cl5_silplot.png"), width = 480, height = 360)
#    lapply(sil, plot_silhouette)
#  dev.off()
#  set.seed(12345)
#  sil2 = clustra_sil(cl2, mccores = mc, conv=c(20,0.5))
#  png(paste0(plot_path, "R_cl2_silplot.png"), width = 480, height = 360)
#    lapply(sil2, plot_silhouette)
#  dev.off()
#  set.seed(12345)
#  sil10 = clustra_sil(cl10, mccores = mc, conv=c(20,0.5))
#  png(paste0(plot_path, "R_cl10_silplot.png"), width = 480, height = 360)
#    lapply(sil10, plot_silhouette)
#  dev.off()
#  knitr::include_graphics(paste0(plot_path, "R_cl5_silplot.png"))
#  knitr::include_graphics(paste0(plot_path, "R_cl2_silplot.png"))
#  knitr::include_graphics(paste0(plot_path, "R_cl10_silplot.png"))
#  deltimeT(t, paste(" silhouettes on", mc, "cores "))

## ----silplot2, fig.width = 7, fig.height = 5, eval = chunk$silplot2-----------
#  # Example running from the data step
#  t = deltime()
#  set.seed(12345)
#  sil = clustra_sil(data, k = c(2, 5, 10), mccores = mc, conv = c(50, 0.5))
#  lapply(sil, plot_silhouette)
#  deltimeT(t, paste("Silhouettes and clustra on", mc, "cores "))

## ----rand_plot, fig.width = 6.5, fig.height=6, eval = chunk$rand_plot---------
#  t = deltime()
#  set.seed(12345)
#  ran = clustra_rand(data, k = seq(2, 10, 1), starts = "random", mccores = mc,
#                     replicates = 10, conv=c(40,0.5), verbose = TRUE)
#  deltimeT(t, paste(" clustra_rand on", mc, "cores "))
#  rand_plot(ran, name = paste0(plot_path, "R_randplot.pdf")) # save pdf version
#  rand_plot(ran) # render png version

## -----------------------------------------------------------------------------
gpred = function(tps, newdata) {
  as.numeric(mgcv::predict.bam(tps, newdata, type = "response",
                               newdata.guaranteed = TRUE))
}

## ----hclust40, fig.width = 7, fig.height = 5, eval = chunk$hclust40-----------
#  set.seed(12345)
#  t = deltime()
#  cl40 = clustra(data, k = 40, maxdf = 30, conv = c(100,0.5), mccores = mc, verbose = TRUE)
#  timep = data.frame(time = seq(min(data$time), max(data$time), length.out = 100))
#  resp = do.call(rbind, lapply(cl40$tps, gpred, newdata = timep))
#  png(paste0(plot_path, "R_hclust_plot.png"), width = 720, height = 480)
#    plot(hclust(dist(resp)))
#  dev.off()
#  knitr::include_graphics(paste0(plot_path, "R_hclust_plot.png"))
#  deltimeT(t, paste("clustra k = 40, conv = c(100, 0.5) on", mc, "cores + hclust + dist "))

## ----SAS_R_data, fig.width = 7, fig.height = 5, eval = chunk$SAS_R_data-------
#  sas.file = function(file, rp = repo_sas) paste0(rp, "sas_results/", file)
#  
#  sas_r_cplot = function(file, clustra_out) {
#    sas = cbind(haven::read_sas(sas.file(file)), source = 1)
#    r = data.frame(time = min(sas$time):max(sas$time))
#    n = nrow(r)
#    k = nrow(sas)/n
#    ## below assumes same time in each group and groups are contiguous.
#    rpred = do.call(c, parallel::mclapply(clustra_out$tps, gpred, newdata = r,
#                                          mc.cores = mc))
#  
#    ## compute mapping from SAS to R using manhattan distance over time
#    rmap = vector("integer", k)
#    m = as.matrix(
#      dist(rbind(t(matrix(sas$pred, n)),
#                 t(matrix(rpred, nrow = n))), "manhattan"))[1:k, (k + 1):(2*k)]
#    for(i in 1:k) { # assign starting with closest pair
#      ind = arrayInd(which.min(m), dim(m))
#      rmap[ind[1]] = ind[2]
#      m[ind[1], ] = Inf
#      m[, ind[2]] = Inf
#    }
#  
#    r = cbind(r, group = sas$group, pred = rpred, source = 2)
#    plot(pred ~ time, data = sas, type = "n")
#    for(i in 1:k) {
#      sr = (1 + (i - 1)*n):(i*n)
#      rr = (1 + (rmap[i] - 1)*n):(rmap[i]*n)
#      lines(sas[sr, ]$time, sas[sr, ]$pred, lty = sas[sr, ]$source, col = i)
#      lines(r[rr, ]$time, r[rr, ]$pred, lty = r[rr, ]$source, col = i)
#    }
#    rmap # output the mapping from SAS clusters to R: R = rmap[SAS]
#  }
#  png(paste0(plot_path, "R_SAS_cl2_compare.png"), width = 480, height = 360)
#    map2 = sas_r_cplot("graphout_cl2.sas7bdat", cl2)
#  dev.off()
#  png(paste0(plot_path, "R_SAS_cl5_compare.png"), width = 480, height = 360)
#    map5 = sas_r_cplot("graphout_cl5.sas7bdat", cl5)
#  dev.off()
#  png(paste0(plot_path, "R_SAS_cl10_compare.png"), width = 480, height = 360)
#    map10 = sas_r_cplot("graphout_cl10.sas7bdat", cl10)
#  dev.off()
#  knitr::include_graphics(paste0(plot_path, "R_SAS_cl2_compare.png"))
#  knitr::include_graphics(paste0(plot_path, "R_SAS_cl5_compare.png"))
#  knitr::include_graphics(paste0(plot_path, "R_SAS_cl10_compare.png"))

## ----Rand_SAS_R, eval = chunk$Rand_SAS_R--------------------------------------
#  MixSim::RandIndex(haven::read_sas(sas.file("rms_cl2.sas7bdat"))$newgroup, cl2$group)
#  MixSim::RandIndex(haven::read_sas(sas.file("rms_cl5.sas7bdat"))$newgroup, cl5$group)
#  MixSim::RandIndex(haven::read_sas(sas.file("rms_cl10.sas7bdat"))$newgroup, cl10$group)

## ----true_group, eval = chunk$true_group--------------------------------------
#  true_group = data[, mean(true_group),by=id]
#  MixSim::RandIndex(haven::read_sas(sas.file("rms_cl5.sas7bdat"))$newgroup, true_group$V1)
#  MixSim::RandIndex(true_group$V1, cl5$group)

## ----echo=FALSE---------------------------------------------------------------
deltimeT(start_time, "clustra vignette run time ")