knitr::opts_chunk$set( fig.path = "man/figures/" )
You can install the development version of tdapseudotime from GitHub with remotes:
# install.packages('remotes') # uncomment to install devtools remotes::install_github('trang1618/tdapseudotime')
library(tdapseudotime) library(mice) library(dplyr) library(tidyr) library(ggplot2) set.seed(1234)
theme_set( theme_bw() + theme( panel.grid.minor = element_blank(), legend.title = element_blank() ))
Parameters:
my_file <- 'MyDataSim.csv' # TDA hyperparameter grid intervals_seq <- seq(5, 10, 1) overlaps_seq <- seq(50, 60, 10) clust_bins <- c(6, 8, 10) ii <- 2 p <- 2 b <- 2 num_intervals <- c(intervals_seq[ii], intervals_seq[ii]) percent_overlap <- overlaps_seq[p] num_bins_when_clustering <- clust_bins[b]
Color palette for enrichment: blue > green > yellow > orange > red
# RColorBrewer::brewer.pal(length(unique(CommunityCluster$membership)), "Set1") my_colors <- c("#00A3DD", "#60C659", "#FFBC21", "#FF7F1E", "#EF2B2D")
Create a time line for each subject (being time zero the first observation). Extract lab values to be used for TDA distance matrix.
Check format: - first col > row id "id" (can be any value, i.e. unique row number) - second col > pts id "covid_id" - third col > date "day"
Need to check whether rowid is important
FupData <- read.csv(my_file, header = TRUE, colClasses = "character") non_lab_value_names <- c('id', 'covid_id', 'day') lab_value_names <- setdiff(names(FupData), non_lab_value_names) processed_data <- FupData %>% mutate(day = as.Date(day, format = '%Y-%m-%d')) %>% group_by(covid_id) %>% mutate(first_date = min(day)) %>% ungroup() %>% mutate( id = as.integer(id), time = as.numeric(day - first_date, units = 'days')) %>% arrange(covid_id, time) %>% mutate_at(dplyr::all_of(lab_value_names), as.numeric)
lab_values_mat <- processed_data[, lab_value_names] %>% mice::mice(m = 5, maxit = 50, meth = 'pmm', seed = 500) %>% # imputation complete(1) %>% scale() # prepare for cosine similarity calculation
# check: # > sum(processed_data$first_date != FupData$FirstDate) # [1] 0 # > sum(processed_data$time != FupData$time) # [1] 0 # > sum(diff(as.numeric(processed_data$id))!=1) # [1] 0 # > sum(diff(as.numeric(FupData$id))!=1) # [1] 0
Compute cosine similarity and principal and secondary SVD:
cosine_sim <- cosine_sim_func(lab_values_mat) trunc_svds <- RSpectra::svds(cosine_sim, k = 2, nu = 2, nv = 2) svd1 <- - trunc_svds$u[, 1] svd2 <- - trunc_svds$u[, 2]
Check cosine similarity values.
hist(apply(cosine_sim, 1, mean)) hist(apply(cosine_sim, 1, min))
Enrich topology by any variable. Use 'time' for now.
f_time <- processed_data %>% select(ID = id, val = time) # or val = CRP, etc. hist(as.numeric(f_time$val))
# takes about 70s to run # system.time(svd_list <- svd(cosine_sim)) # SVD1 <- svd_list$u[, 1] # SVD2 <- svd_list$u[, 2] # hist(SVD1) # hist(SVD2) # tolerance = 1^(-10) # sum(svd2 - SVD2 > tolerance) # sum(svd1 - SVD1 > tolerance)
f_sim_map <- TDAmapper::mapper2D( distance_matrix = cosine_sim, filter_values = list(svd1 , svd2), num_intervals = c(intervals_seq[ii], intervals_seq[ii]), percent_overlap = overlaps_seq[p], num_bins_when_clustering = clust_bins[b] ) f_graph <- make_graph(f_sim_map, f_time, 'clust_color', my_colors)
# Check colormap id and f_graph vertex to make sure they match # color_map <- color_vertex(f_sim_map, f_time) # V(f_graph)
Create a minimum spanning tree (MST).
minspantreeweights <- igraph::mst(f_graph, weights = f_graph$clusters$edge.betweenness) plot(minspantreeweights) legend( "topright", legend = f_graph$pal$cluster, col = f_graph$pal$color, fill = f_graph$pal$color, horiz = TRUE, cex = 0.4 )
Check the MST plot and TIME boxplots.
Compute trajectories and assign observations to nodes in network.
out_trajectories <- find_trajectories( minspantreeweights, processed_data, f_sim_map, f_graph) similarity_df <- out_trajectories[[1]] id_node_cluster <- out_trajectories[[2]]
most_similar_traj <- similarity_df %>% group_by(covid_id) %>% slice(which.max(JW)) %>% # use Jaccard similarity for now mutate(trajNumbManual = case_when( trajNumb %in% c(1, 4) ~ "R>B", trajNumb %in% c(2, 3) ~ "B", trajNumb %in% c(5, 6 ) ~ "G>P>B", trajNumb %in% c(7, 8 ) ~ "P>G>P>B", TRUE ~ "" ))
data_out <- most_similar_traj %>% select(covid_id, trajNumbManual) table(data_out$trajNumbManual)
plot_dat <- processed_data %>% left_join(id_node_cluster %>% distinct(covid_id, id, cluster), by = c('id', 'covid_id')) plot_dat %>% ggplot(aes(x = cluster, y = time, fill = cluster)) + geom_boxplot(alpha = 0.8) + scale_fill_manual(values = f_graph$pal$color) + scale_color_manual(values = f_graph$pal$color) + theme(legend.position = "none", plot.title = element_text(size = 8, hjust = 0.5)) plot_dat %>% select(cluster, lab_value_names) %>% pivot_longer(- cluster, names_to = 'Lab', values_to = 'lab_value') %>% ggplot(aes(x = cluster, y = lab_value, fill = cluster)) + geom_boxplot(alpha = 0.8) + labs(x = NULL, y = NULL) + scale_fill_manual(values = f_graph$pal$color) + scale_color_manual(values = f_graph$pal$color) + theme(legend.position = "none") + facet_wrap(~ Lab, scales = 'free_y') processed_data_traj <- processed_data %>% left_join(most_similar_traj, by = c("covid_id")) %>% mutate(trajNumbManual = as.factor(trajNumbManual), time) %>% select(time, trajNumbManual, lab_value_names) %>% distinct()
processed_data_traj %>% pivot_longer(- c(time, trajNumbManual), names_to = 'Lab', values_to = 'lab_value') %>% ggplot(aes(time, lab_value, colour = trajNumbManual, group = trajNumbManual, fill = trajNumbManual)) + geom_smooth(method = "loess") + theme(legend.position = "none") + facet_wrap(~ Lab, scales = 'free_y')
Please open an issue for questions related to tdapseudotime usage, bug reports or general inquiries.
Thank you very much for your support!
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.