In trang1618/tdapseudotime: Temporal Phenotyping via Topological Data Analysis and Pseudo Time

If your site is one of the "train" sites, please use this workflow to establish the topology and trajectories using the observations at your site.

Setting up

library(tdapseudotime)
library(mice)
library(dplyr)
library(tidyr)
library(ggplot2)

set.seed(1234)

theme_set(
  theme_bw() + 
    theme(
      panel.grid.minor = element_blank(),
      legend.title = element_blank()
    ))

Preprocess

Parameters:

my_file <- 'train-data.csv'

# TDAMapper parameters (see grid search below)
n_intervals <- 6
p_overlaps <- 60
n_clusts <- 8

Color palette for enrichment: blue > green > yellow > orange > red

# RColorBrewer::brewer.pal(length(unique(CommunityCluster$membership)), "Set1")
my_colors <- c("#00A3DD", "#60C659", "#FFBC21", "#FF7F1E", "#EF2B2D")

Read in data file

Create a timeline for each subject (being time zero the first observation). Extract lab values to be used for TDA distance matrix.

Check format: - first col: row id "id" (can be any value, i.e. unique row number) - second col: pts id "covid_id" - third col: date "day"

Need to check whether rowid is important

FupData <- read.csv(my_file, header = TRUE) %>% 
  widen_i2b2()

non_lab_value_names <- c('id', 'covid_id', 'CardiacTroponinHighSensitivity')
lab_value_names <- setdiff(names(FupData), non_lab_value_names)
processed_data <- mutate_at(FupData, dplyr::all_of(lab_value_names), as.numeric)
dim(processed_data)

mm <- 5
lab_values_mat <- processed_data[, lab_value_names] %>% 
  mice(m = mm, maxit = 25, meth = 'pmm', seed = 500) %>% # imputation
  # complete(1) %>%
  # scale() %>% # prepare for cosine similarity calculation
  {.}

lab_values_mat <- lapply(1:mm, function(x) complete(lab_values_mat, x)) %>%
  do.call(rbind, .) %>%
  as.matrix() %>%
  scale()

Perform grid search to select a combination of hyperparameters

The code in this section is optional.

# intervals_seq <- seq(3, 11, 1)
# overlaps <- 60
# clusts <- 8
# graph_grid <- list()
# 
# for (ii in seq_along(intervals_seq)){
#   print(ii)
#   intervals <- intervals_seq[ii]
#   f_sim_map <- map_tda(lab_values_mat,
#                        num_intervals = c(intervals, intervals),
#                        percent_overlap = overlaps,
#                        num_bins_when_clustering = clusts)
#   f_graph <- igraph::graph.adjacency(f_sim_map$adjacency, mode = "undirected")
#   
#   graph_grid[[ii]] <- c(intervals = intervals, overlaps = overlaps,
#                         clust_bins = clusts,
#                         get_graph_properties(f_graph, f_sim_map))
# }
# graph_grid <- do.call("rbind", graph_grid) %>% 
#   as.data.frame()
# 
# graph_grid %>% 
#   select(- overlaps, - clust_bins) %>% 
#   pivot_longer(-intervals) %>% 
#   ggplot(aes(x = intervals, y = value)) +
#   geom_point() +
#   scale_x_continuous(breaks = intervals_seq) +
#   facet_wrap(~ name, scales = 'free_y')

# intervals_seq <- seq(3, 11, 1)
# # overlaps_seq <- seq(50, 100, 10)
# # clust_bins <- c(6, 8, 10, 12, 14)
# overlaps <- 50
# clusts <- 6
# graph_grid <- list()
# i <- 0
# for (ii in seq_along(intervals_seq)){
#   # for (p in seq_along(overlaps_seq)){
#     # for (b in seq_along(clust_bins)){
#       i <- i + 1
#       print(ii)
#       intervals <- intervals_seq[ii]
#       # overlaps <- overlaps_seq[p]
#       # clusts <- clust_bins[b]
# 
#       f_sim_map <- map_tda(lab_values_mat,
#                      num_intervals = c(intervals, intervals),
#                      percent_overlap = overlaps,
#                      num_bins_when_clustering = clusts)
#       f_graph <- igraph::graph.adjacency(f_sim_map$adjacency, mode = "undirected")
# 
#       graph_grid[[i]] <- c(intervals = intervals, overlaps = overlaps,
#                            clust_bins = clusts,
#                            get_graph_properties(f_graph, f_sim_map))
#     # }
#   # }
# }
# graph_grid <- do.call("rbind", graph_grid) 
# graph_grid %>% 
#   as.data.frame() %>% 
#   filter(n_comps == 1, clust_bins <= 10) %>% 
#   distinct(intervals, n_comps, n_nodes, median_degree, edge_density, clique_length, diameter, mean_distance, .keep_all = T)

We chose the hyperparameters to be 6 to be the number of intervals for TDAMapper.

# graph_grid %>% 
#   # as.data.frame() %>% 
#   filter(intervals == 6)

Run TDA Mapper

f_sim_map <- map_tda(lab_values_mat,
                     num_intervals = c(n_intervals, n_intervals),
                     percent_overlap = p_overlaps,
                     num_bins_when_clustering = n_clusts)
f_graph <- make_tda_graph(
  f_sim_map, 
  data = processed_data, 
  enrich_var = 'time', # enrich topology by time for now
  color_method = 'clust_color',
  my_colors = my_colors
)

Create a minimum spanning tree

and compute trajectories and assign observations to nodes in network

THIS STEP NEED MANUAL REVIEW

Check the MST plot and TIME boxplots

out_trajectories <- find_trajectories(processed_data, f_sim_map, f_graph)
out_list <- compute_similarity(processed_data, f_graph$node_color, out_trajectories, f_sim_map)

similarity_df <- out_list[[1]]
id_node_cluster <- out_list[[2]]
most_similar_traj <- similarity_df %>%
  group_by(covid_id) %>%
  slice(which.max(SJ)) # use Jaccard similarity

head(most_similar_traj, 10)

centroids <- processed_data[, c('id', lab_value_names)] %>% 
  left_join(id_node_cluster[, c('id', 'node')], by = 'id') %>% 
  group_by(node) %>% 
  summarise(across(.fns = median), .groups = 'drop') %>% 
  select(-id)

node_color <- f_graph$node_color
# save(centroids, node_color, out_trajectories, file = '../data/centroids.rda')

Write output

data_out <- most_similar_traj %>% select(covid_id, clusterTraj)
table(data_out$clusterTraj)

Visualizations

plot_dat <- processed_data %>% 
  left_join(id_node_cluster %>% distinct(covid_id, id, cluster), 
            by = c('id', 'covid_id'))

plot_dat %>% 
  ggplot(aes(x = cluster, y = time, fill = cluster)) +
  geom_boxplot(alpha = 0.8) + 
  scale_fill_manual(values = f_graph$pal$color) + 
  scale_color_manual(values = f_graph$pal$color) +
  theme(legend.position = "none",
        plot.title = element_text(size = 8, hjust = 0.5))

plot_dat %>% 
  select(cluster, all_of(lab_value_names)) %>% 
  pivot_longer(- cluster, names_to = 'Lab', values_to = 'lab_value') %>% 
  ggplot(aes(x = cluster, y = lab_value, fill = cluster)) +
  geom_boxplot(alpha = 0.8) + 
  labs(x = NULL, y = NULL) +
  scale_fill_manual(values = f_graph$pal$color) + 
  scale_color_manual(values = f_graph$pal$color) +
  theme(legend.position = "none") +
  facet_wrap(~ Lab, scales = 'free_y')

processed_data_traj <- processed_data %>% 
  left_join(most_similar_traj, by = c("covid_id")) %>% 
  mutate(clusterTraj = as.factor(clusterTraj), time) %>%
  select(time, clusterTraj, all_of(lab_value_names)) %>% 
  distinct()

processed_data_traj  %>% 
  pivot_longer(- c(time, clusterTraj), 
               names_to = 'Lab', values_to = 'lab_value') %>% 
  ggplot(aes(time, lab_value, colour = clusterTraj, 
             group = clusterTraj, fill = clusterTraj)) +
  geom_smooth(method = "loess") +
  theme(legend.position = "none") +
  facet_wrap(~ Lab, scales = 'free_y')