If your site is one of the "train" sites, please use this workflow to establish the topology and trajectories using the observations at your site.
library(tdapseudotime) library(mice) library(dplyr) library(tidyr) library(ggplot2) set.seed(1234)
theme_set( theme_bw() + theme( panel.grid.minor = element_blank(), legend.title = element_blank() ))
Parameters:
my_file <- 'train-data.csv' # TDAMapper parameters (see grid search below) n_intervals <- 6 p_overlaps <- 60 n_clusts <- 8
Color palette for enrichment: blue > green > yellow > orange > red
# RColorBrewer::brewer.pal(length(unique(CommunityCluster$membership)), "Set1") my_colors <- c("#00A3DD", "#60C659", "#FFBC21", "#FF7F1E", "#EF2B2D")
Create a timeline for each subject (being time zero the first observation). Extract lab values to be used for TDA distance matrix.
Check format: - first col: row id "id" (can be any value, i.e. unique row number) - second col: pts id "covid_id" - third col: date "day"
Need to check whether rowid is important
FupData <- read.csv(my_file, header = TRUE) %>% widen_i2b2() non_lab_value_names <- c('id', 'covid_id', 'CardiacTroponinHighSensitivity') lab_value_names <- setdiff(names(FupData), non_lab_value_names) processed_data <- mutate_at(FupData, dplyr::all_of(lab_value_names), as.numeric) dim(processed_data)
mm <- 5 lab_values_mat <- processed_data[, lab_value_names] %>% mice(m = mm, maxit = 25, meth = 'pmm', seed = 500) %>% # imputation # complete(1) %>% # scale() %>% # prepare for cosine similarity calculation {.} lab_values_mat <- lapply(1:mm, function(x) complete(lab_values_mat, x)) %>% do.call(rbind, .) %>% as.matrix() %>% scale()
The code in this section is optional.
# intervals_seq <- seq(3, 11, 1) # overlaps <- 60 # clusts <- 8 # graph_grid <- list() # # for (ii in seq_along(intervals_seq)){ # print(ii) # intervals <- intervals_seq[ii] # f_sim_map <- map_tda(lab_values_mat, # num_intervals = c(intervals, intervals), # percent_overlap = overlaps, # num_bins_when_clustering = clusts) # f_graph <- igraph::graph.adjacency(f_sim_map$adjacency, mode = "undirected") # # graph_grid[[ii]] <- c(intervals = intervals, overlaps = overlaps, # clust_bins = clusts, # get_graph_properties(f_graph, f_sim_map)) # } # graph_grid <- do.call("rbind", graph_grid) %>% # as.data.frame() # # graph_grid %>% # select(- overlaps, - clust_bins) %>% # pivot_longer(-intervals) %>% # ggplot(aes(x = intervals, y = value)) + # geom_point() + # scale_x_continuous(breaks = intervals_seq) + # facet_wrap(~ name, scales = 'free_y')
# intervals_seq <- seq(3, 11, 1) # # overlaps_seq <- seq(50, 100, 10) # # clust_bins <- c(6, 8, 10, 12, 14) # overlaps <- 50 # clusts <- 6 # graph_grid <- list() # i <- 0 # for (ii in seq_along(intervals_seq)){ # # for (p in seq_along(overlaps_seq)){ # # for (b in seq_along(clust_bins)){ # i <- i + 1 # print(ii) # intervals <- intervals_seq[ii] # # overlaps <- overlaps_seq[p] # # clusts <- clust_bins[b] # # f_sim_map <- map_tda(lab_values_mat, # num_intervals = c(intervals, intervals), # percent_overlap = overlaps, # num_bins_when_clustering = clusts) # f_graph <- igraph::graph.adjacency(f_sim_map$adjacency, mode = "undirected") # # graph_grid[[i]] <- c(intervals = intervals, overlaps = overlaps, # clust_bins = clusts, # get_graph_properties(f_graph, f_sim_map)) # # } # # } # } # graph_grid <- do.call("rbind", graph_grid) # graph_grid %>% # as.data.frame() %>% # filter(n_comps == 1, clust_bins <= 10) %>% # distinct(intervals, n_comps, n_nodes, median_degree, edge_density, clique_length, diameter, mean_distance, .keep_all = T)
We chose the hyperparameters to be 6 to be the number of intervals for TDAMapper.
# graph_grid %>% # # as.data.frame() %>% # filter(intervals == 6)
f_sim_map <- map_tda(lab_values_mat, num_intervals = c(n_intervals, n_intervals), percent_overlap = p_overlaps, num_bins_when_clustering = n_clusts) f_graph <- make_tda_graph( f_sim_map, data = processed_data, enrich_var = 'time', # enrich topology by time for now color_method = 'clust_color', my_colors = my_colors )
and compute trajectories and assign observations to nodes in network
Check the MST plot and TIME boxplots
out_trajectories <- find_trajectories(processed_data, f_sim_map, f_graph) out_list <- compute_similarity(processed_data, f_graph$node_color, out_trajectories, f_sim_map)
similarity_df <- out_list[[1]] id_node_cluster <- out_list[[2]] most_similar_traj <- similarity_df %>% group_by(covid_id) %>% slice(which.max(SJ)) # use Jaccard similarity head(most_similar_traj, 10) centroids <- processed_data[, c('id', lab_value_names)] %>% left_join(id_node_cluster[, c('id', 'node')], by = 'id') %>% group_by(node) %>% summarise(across(.fns = median), .groups = 'drop') %>% select(-id) node_color <- f_graph$node_color # save(centroids, node_color, out_trajectories, file = '../data/centroids.rda')
data_out <- most_similar_traj %>% select(covid_id, clusterTraj) table(data_out$clusterTraj)
plot_dat <- processed_data %>% left_join(id_node_cluster %>% distinct(covid_id, id, cluster), by = c('id', 'covid_id')) plot_dat %>% ggplot(aes(x = cluster, y = time, fill = cluster)) + geom_boxplot(alpha = 0.8) + scale_fill_manual(values = f_graph$pal$color) + scale_color_manual(values = f_graph$pal$color) + theme(legend.position = "none", plot.title = element_text(size = 8, hjust = 0.5)) plot_dat %>% select(cluster, all_of(lab_value_names)) %>% pivot_longer(- cluster, names_to = 'Lab', values_to = 'lab_value') %>% ggplot(aes(x = cluster, y = lab_value, fill = cluster)) + geom_boxplot(alpha = 0.8) + labs(x = NULL, y = NULL) + scale_fill_manual(values = f_graph$pal$color) + scale_color_manual(values = f_graph$pal$color) + theme(legend.position = "none") + facet_wrap(~ Lab, scales = 'free_y') processed_data_traj <- processed_data %>% left_join(most_similar_traj, by = c("covid_id")) %>% mutate(clusterTraj = as.factor(clusterTraj), time) %>% select(time, clusterTraj, all_of(lab_value_names)) %>% distinct()
processed_data_traj %>% pivot_longer(- c(time, clusterTraj), names_to = 'Lab', values_to = 'lab_value') %>% ggplot(aes(time, lab_value, colour = clusterTraj, group = clusterTraj, fill = clusterTraj)) + geom_smooth(method = "loess") + theme(legend.position = "none") + facet_wrap(~ Lab, scales = 'free_y')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.