You can install the development version of tdapseudotime from GitHub with remotes:
# install.packages('remotes') # uncomment to install devtools
remotes::install_github('trang1618/tdapseudotime')
library(tdapseudotime)
library(mice)
library(dplyr)
library(tidyr)
library(ggplot2)
set.seed(1234)
Parameters:
my_file <- 'MyDataSim.csv'
# TDA hyperparameter grid
intervals_seq <- seq(5, 10, 1)
overlaps_seq <- seq(50, 60, 10)
clust_bins <- c(6, 8, 10)
ii <- 2
p <- 2
b <- 2
num_intervals <- c(intervals_seq[ii], intervals_seq[ii])
percent_overlap <- overlaps_seq[p]
num_bins_when_clustering <- clust_bins[b]
Color palette for enrichment: blue > green > yellow > orange > red
# RColorBrewer::brewer.pal(length(unique(CommunityCluster$membership)), "Set1")
my_colors <- c("#00A3DD", "#60C659", "#FFBC21", "#FF7F1E", "#EF2B2D")
Create a time line for each subject (being time zero the first observation). Extract lab values to be used for TDA distance matrix.
Check format: - first col > row id “id” (can be any value, i.e. unique row number) - second col > pts id “covid_id” - third col > date “day”
Need to check whether rowid is important
FupData <- read.csv(my_file, header = TRUE, colClasses = "character")
non_lab_value_names <- c('id', 'covid_id', 'day')
lab_value_names <- setdiff(names(FupData), non_lab_value_names)
processed_data <- FupData %>%
mutate(day = as.Date(day, format = '%Y-%m-%d')) %>%
group_by(covid_id) %>%
mutate(first_date = min(day)) %>%
ungroup() %>%
mutate(
id = as.integer(id),
time = as.numeric(day - first_date, units = 'days')) %>%
arrange(covid_id, time) %>%
mutate_at(dplyr::all_of(lab_value_names), as.numeric)
lab_values_mat <- processed_data[, lab_value_names] %>%
mice::mice(m = 5, maxit = 50, meth = 'pmm', seed = 500) %>% # imputation
complete(1) %>%
scale() # prepare for cosine similarity calculation
Compute cosine similarity and principal and secondary SVD:
cosine_sim <- cosine_sim_func(lab_values_mat)
trunc_svds <- RSpectra::svds(cosine_sim, k = 2, nu = 2, nv = 2)
svd1 <- - trunc_svds$u[, 1]
svd2 <- - trunc_svds$u[, 2]
Check cosine similarity values.
hist(apply(cosine_sim, 1, mean))
hist(apply(cosine_sim, 1, min))
Enrich topology by any variable. Use ‘time’ for now.
f_time <- processed_data %>%
select(ID = id, val = time) # or val = CRP, etc.
hist(as.numeric(f_time$val))
f_sim_map <- TDAmapper::mapper2D(
distance_matrix = cosine_sim,
filter_values = list(svd1 , svd2),
num_intervals = c(intervals_seq[ii], intervals_seq[ii]),
percent_overlap = overlaps_seq[p],
num_bins_when_clustering = clust_bins[b]
)
f_graph <- make_graph(f_sim_map, f_time, 'clust_color', my_colors)
Create a minimum spanning tree (MST).
minspantreeweights <- igraph::mst(f_graph, weights = f_graph$clusters$edge.betweenness)
plot(minspantreeweights)
legend(
"topright",
legend = f_graph$pal$cluster,
col = f_graph$pal$color,
fill = f_graph$pal$color,
horiz = TRUE,
cex = 0.4
)
Check the MST plot and TIME boxplots.
Compute trajectories and assign observations to nodes in network.
out_trajectories <- find_trajectories(
minspantreeweights, processed_data, f_sim_map, f_graph)
## trajElmnts clusterTraj
## 1 3 2 1 7 13 14 15 16 17 18 1>2
## 2 4 5 11 12 18 2
## 3 6 12 18 2
## 4 8 9 10 16 17 18 1>2
## 5 20 19 25 31 32 33 34 35 36 30 24 18 3>4>2
## 6 21 27 26 25 31 32 33 34 35 36 30 24 18 3>4>2
## 7 23 22 28 27 26 25 31 32 33 34 35 36 30 24 18 4>3>4>2
## 8 29 28 27 26 25 31 32 33 34 35 36 30 24 18 4>3>4>2
similarity_df <- out_trajectories[[1]]
id_node_cluster <- out_trajectories[[2]]
most_similar_traj <- similarity_df %>%
group_by(covid_id) %>%
slice(which.max(JW)) %>% # use Jaccard similarity for now
mutate(trajNumbManual = case_when(
trajNumb %in% c(1, 4) ~ "R>B",
trajNumb %in% c(2, 3) ~ "B",
trajNumb %in% c(5, 6 ) ~ "G>P>B",
trajNumb %in% c(7, 8 ) ~ "P>G>P>B",
TRUE ~ ""
))
data_out <- most_similar_traj %>% select(covid_id, trajNumbManual)
table(data_out$trajNumbManual)
##
## B G>P>B P>G>P>B R>B
## 33 110 310 95
plot_dat <- processed_data %>%
left_join(id_node_cluster %>% distinct(covid_id, id, cluster),
by = c('id', 'covid_id'))
plot_dat %>%
ggplot(aes(x = cluster, y = time, fill = cluster)) +
geom_boxplot(alpha = 0.8) +
scale_fill_manual(values = f_graph$pal$color) +
scale_color_manual(values = f_graph$pal$color) +
theme(legend.position = "none",
plot.title = element_text(size = 8, hjust = 0.5))
plot_dat %>%
select(cluster, lab_value_names) %>%
pivot_longer(- cluster, names_to = 'Lab', values_to = 'lab_value') %>%
ggplot(aes(x = cluster, y = lab_value, fill = cluster)) +
geom_boxplot(alpha = 0.8) +
labs(x = NULL, y = NULL) +
scale_fill_manual(values = f_graph$pal$color) +
scale_color_manual(values = f_graph$pal$color) +
theme(legend.position = "none") +
facet_wrap(~ Lab, scales = 'free_y')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(lab_value_names)` instead of `lab_value_names` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
processed_data_traj <- processed_data %>%
left_join(most_similar_traj, by = c("covid_id")) %>%
mutate(trajNumbManual = as.factor(trajNumbManual), time) %>%
select(time, trajNumbManual, lab_value_names) %>%
distinct()
processed_data_traj %>%
pivot_longer(- c(time, trajNumbManual),
names_to = 'Lab', values_to = 'lab_value') %>%
ggplot(aes(time, lab_value, colour = trajNumbManual,
group = trajNumbManual, fill = trajNumbManual)) +
geom_smooth(method = "loess") +
theme(legend.position = "none") +
facet_wrap(~ Lab, scales = 'free_y')
## `geom_smooth()` using formula 'y ~ x'
Please open an issue for questions related to tdapseudotime usage, bug reports or general inquiries.
Thank you very much for your support!
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.