knitr::opts_chunk$set(
  fig.path = "man/figures/"
)

You can install the development version of tdapseudotime from GitHub with remotes:

# install.packages('remotes') # uncomment to install devtools
remotes::install_github('trang1618/tdapseudotime')
library(tdapseudotime)
library(mice)
library(dplyr)
library(tidyr)
library(ggplot2)

set.seed(1234)
theme_set(
  theme_bw() + 
    theme(
      panel.grid.minor = element_blank(),
      legend.title = element_blank()
    ))

Preprocess

Parameters:

my_file <- 'MyDataSim.csv'

# TDA hyperparameter grid
intervals_seq <- seq(5, 10, 1)
overlaps_seq <- seq(50, 60, 10)
clust_bins <- c(6, 8, 10)
ii <- 2
p <- 2
b <- 2
num_intervals <- c(intervals_seq[ii], intervals_seq[ii])
percent_overlap <- overlaps_seq[p]
num_bins_when_clustering <- clust_bins[b]

Color palette for enrichment: blue > green > yellow > orange > red

# RColorBrewer::brewer.pal(length(unique(CommunityCluster$membership)), "Set1")
my_colors <- c("#00A3DD", "#60C659", "#FFBC21", "#FF7F1E", "#EF2B2D")

Read in data file

Create a time line for each subject (being time zero the first observation). Extract lab values to be used for TDA distance matrix.

Check format: - first col > row id "id" (can be any value, i.e. unique row number) - second col > pts id "covid_id" - third col > date "day"

Need to check whether rowid is important

FupData <- read.csv(my_file, header = TRUE, colClasses = "character")
non_lab_value_names <- c('id', 'covid_id', 'day')
lab_value_names <- setdiff(names(FupData), non_lab_value_names)

processed_data <- FupData %>% 
  mutate(day = as.Date(day, format = '%Y-%m-%d')) %>% 
  group_by(covid_id) %>% 
  mutate(first_date = min(day)) %>% 
  ungroup() %>% 
  mutate(
    id = as.integer(id),
    time = as.numeric(day - first_date, units = 'days')) %>% 
  arrange(covid_id, time) %>% 
  mutate_at(dplyr::all_of(lab_value_names), as.numeric)
lab_values_mat <- processed_data[, lab_value_names] %>% 
  mice::mice(m = 5, maxit = 50, meth = 'pmm', seed = 500) %>% # imputation
  complete(1) %>% 
  scale() # prepare for cosine similarity calculation
# check:
# > sum(processed_data$first_date != FupData$FirstDate)
# [1] 0
# > sum(processed_data$time != FupData$time)
# [1] 0
# > sum(diff(as.numeric(processed_data$id))!=1)
# [1] 0
# > sum(diff(as.numeric(FupData$id))!=1)
# [1] 0

Preparing for running TDA Mapper

Compute cosine similarity and principal and secondary SVD:

cosine_sim <- cosine_sim_func(lab_values_mat)
trunc_svds <- RSpectra::svds(cosine_sim, k = 2, nu = 2, nv = 2)
svd1 <- - trunc_svds$u[, 1]
svd2 <- - trunc_svds$u[, 2]

Check cosine similarity values.

hist(apply(cosine_sim, 1, mean))
hist(apply(cosine_sim, 1, min))

Enrich topology by any variable. Use 'time' for now.

f_time <- processed_data %>% 
  select(ID = id, val = time) # or val = CRP, etc.
hist(as.numeric(f_time$val))
# takes about 70s to run
# system.time(svd_list <- svd(cosine_sim))
# SVD1 <- svd_list$u[, 1]
# SVD2 <- svd_list$u[, 2]
# hist(SVD1)
# hist(SVD2)
# tolerance = 1^(-10)
# sum(svd2 - SVD2 > tolerance)
# sum(svd1 - SVD1 > tolerance)

Run function TDA Mapper

f_sim_map <- TDAmapper::mapper2D(
  distance_matrix = cosine_sim,
  filter_values = list(svd1 , svd2),
  num_intervals = c(intervals_seq[ii], intervals_seq[ii]),
  percent_overlap = overlaps_seq[p],
  num_bins_when_clustering = clust_bins[b]
)

f_graph <- make_graph(f_sim_map, f_time, 'clust_color', my_colors)
# Check colormap id and f_graph vertex to make sure they match
# color_map <- color_vertex(f_sim_map, f_time)
# V(f_graph)

Create a minimum spanning tree (MST).

minspantreeweights <- igraph::mst(f_graph, weights = f_graph$clusters$edge.betweenness)
plot(minspantreeweights)
legend(
  "topright",
  legend = f_graph$pal$cluster,
  col = f_graph$pal$color,
  fill = f_graph$pal$color,
  horiz = TRUE,
  cex = 0.4
)

THIS STEP NEED MANUAL REVIEW

Check the MST plot and TIME boxplots.

Compute trajectories and assign observations to nodes in network.

out_trajectories <- find_trajectories(
  minspantreeweights, processed_data, f_sim_map, f_graph)
similarity_df <- out_trajectories[[1]]
id_node_cluster <- out_trajectories[[2]]
most_similar_traj <- similarity_df %>%
  group_by(covid_id) %>%
  slice(which.max(JW)) %>% # use Jaccard similarity for now
  mutate(trajNumbManual = case_when(
    trajNumb %in% c(1, 4) ~ "R>B",
    trajNumb %in% c(2, 3) ~ "B",
    trajNumb %in% c(5, 6 ) ~ "G>P>B",
    trajNumb %in% c(7, 8 ) ~ "P>G>P>B",
    TRUE ~ ""
  ))

Write output

data_out <- most_similar_traj %>% select(covid_id, trajNumbManual)
table(data_out$trajNumbManual)

Visualizations

plot_dat <- processed_data %>% 
  left_join(id_node_cluster %>% distinct(covid_id, id, cluster), 
            by = c('id', 'covid_id'))

plot_dat %>% 
  ggplot(aes(x = cluster, y = time, fill = cluster)) +
  geom_boxplot(alpha = 0.8) + 
  scale_fill_manual(values = f_graph$pal$color) + 
  scale_color_manual(values = f_graph$pal$color) +
  theme(legend.position = "none",
        plot.title = element_text(size = 8, hjust = 0.5))

plot_dat %>% 
  select(cluster, lab_value_names) %>% 
  pivot_longer(- cluster, names_to = 'Lab', values_to = 'lab_value') %>% 
  ggplot(aes(x = cluster, y = lab_value, fill = cluster)) +
  geom_boxplot(alpha = 0.8) + 
  labs(x = NULL, y = NULL) +
  scale_fill_manual(values = f_graph$pal$color) + 
  scale_color_manual(values = f_graph$pal$color) +
  theme(legend.position = "none") +
  facet_wrap(~ Lab, scales = 'free_y')

processed_data_traj <- processed_data %>% 
  left_join(most_similar_traj, by = c("covid_id")) %>% 
  mutate(trajNumbManual = as.factor(trajNumbManual), time) %>%
  select(time, trajNumbManual, lab_value_names) %>% 
  distinct()
processed_data_traj  %>% 
  pivot_longer(- c(time, trajNumbManual), 
                      names_to = 'Lab', values_to = 'lab_value') %>% 
  ggplot(aes(time, lab_value, colour = trajNumbManual, 
             group = trajNumbManual, fill = trajNumbManual)) +
  geom_smooth(method = "loess") +
  theme(legend.position = "none") +
  facet_wrap(~ Lab, scales = 'free_y')

Please open an issue for questions related to tdapseudotime usage, bug reports or general inquiries.

Thank you very much for your support!



covidclinical/Phase2.1TDAPseudotimeRPackage documentation built on Sept. 27, 2020, 12:03 a.m.