knitr::opts_chunk$set(echo = FALSE)
library(pmartRmems) library(dplyr) library(ggplot2) library(lubridate) data("ibd")
time <- ibd$f_data %>% select(patientnumber, collection_timestamp, timepoint, diagnosis_full) %>% mutate(rowID = 1:nrow(.)) %>% select(rowID, everything()) time$collection_timestamp <- mdy(time$collection_timestamp) # timedf <- tidyr::separate(time, collection_timestamp, sep = "/", into = c("month","day","year")) timediff <- time %>% group_by(patientnumber) %>% summarise(timespan = (max(collection_timestamp) - min(collection_timestamp))[[1]]) timedf <- merge(time, timediff, by = "patientnumber") %>% arrange(timespan)
ggplot(timedf) + geom_bar(aes(as.factor(timepoint))) + labs(x = "Number of Visits", title = "Number of Visits Completed by Each Patient")
plotdat <- timedf plotdat$newID <- factor(plotdat$patientnumber, levels = unique(plotdat$patientnumber)) ggplot(plotdat) + geom_line(aes(collection_timestamp, newID, group = newID)) + geom_point(aes(collection_timestamp, newID, group = newID)) + coord_cartesian(xlim = c(mdy(01012009), mdy(01012012))) + theme(axis.text.y = element_blank()) + labs(title = "Study Duration by Subject", subtitle = "Ordered by Length of Study")
plotdat <- timedf %>% arrange(collection_timestamp) plotdat$newID <- factor(plotdat$patientnumber, levels = unique(plotdat$patientnumber)) ggplot(plotdat) + geom_line(aes(collection_timestamp, newID, group = newID)) + geom_point(aes(collection_timestamp, newID, group = newID)) + coord_cartesian(xlim = c(mdy(01012009), mdy(01012012))) + theme(axis.text.y = element_blank()) + labs(title = "Study Duration by Subject", subtitle = "Ordered by Beginning Date of Study")
# time between points vs number of time points newdf <- timedf %>% arrange(patientnumber, collection_timestamp) not_uniq <- unlist(lapply(unique(newdf$patientnumber), function(x) { duplicated(newdf$collection_timestamp[newdf$patientnumber == x]) })) time_uniq <- newdf[!not_uniq,] timediff_by_patient <- function(timedata, patient) { dat <- timedata %>% filter(patientnumber == patient) %>% arrange(collection_timestamp) ts <- dat$collection_timestamp dat$lag <- as.numeric(ts - dplyr::lag(ts)) dat$days_from_start <- as.numeric(ts - min(ts)) return(dat) } templist <- lapply(unique(time_uniq$patientnumber), timediff_by_patient, timedata = time_uniq) lagdata <- do.call(rbind, templist) sumdata <- lagdata %>% group_by(patientnumber) %>% summarise(ntimes = n()) new_timedat <- merge(lagdata, sumdata, by = "patientnumber")
no_na_lag <- new_timedat %>% filter(!is.na(lag)) ggplot(no_na_lag) + geom_point(aes(ntimes, lag, col = patientnumber)) + coord_cartesian(ylim = c(0, 500))
plotdat <- new_timedat %>% arrange(timespan) plotdat$newID <- factor(plotdat$patientnumber, levels = unique(plotdat$patientnumber)) ggplot(plotdat) + geom_line(aes(days_from_start, newID, group = newID)) + geom_point(aes(days_from_start, newID, group = newID)) + coord_cartesian(xlim = c(0, 1000)) + theme(axis.text.y = element_blank()) + labs(title = "Study Duration by Subject", subtitle = "Scaled from First Study")
plotdat <- new_timedat %>% arrange(timespan) plotdat$newID <- factor(plotdat$patientnumber, levels = unique(plotdat$patientnumber)) ggplot(plotdat) + geom_line(aes(days_from_start, newID, group = newID, col = diagnosis_full)) + geom_point(aes(days_from_start, newID, group = newID, col = diagnosis_full)) + coord_cartesian(xlim = c(0, 1000)) + theme(axis.text.y = element_blank()) + labs(title = "Study Duration by Subject", subtitle = "Colored by Diagnosis")
plotdat <- no_na_lag %>% filter(timepoint < 10) %>% mutate(timepoint = as.factor(timepoint)) ggplot(plotdat) + geom_density(aes(lag, group = timepoint, fill = timepoint), alpha = 0.5) + coord_cartesian(xlim = c(0, 200)) + labs(title = "Time Between Points", x = "Days")
tempdat <- new_timedat %>% filter(diagnosis_full %in% c("CD", "HC", "UC")) %>% group_by(patientnumber, diagnosis_full) %>% summarise(count = n()) dotdf <- tempdat %>% group_by(count, diagnosis_full) %>% summarise(dotsize = n()) plotdat <- merge(tempdat, dotdf) %>% mutate(num_datapoints = count * dotsize) ggplot(plotdat) + geom_point(aes(diagnosis_full, count, col = diagnosis_full, size = dotsize)) + annotate("text", x = plotdat$diagnosis_full, y = plotdat$count, label = plotdat$dotsize) + labs(x = "Diagnosis", y = "Number of Visits", title = "Patient Visits by Diagnosis Type", subtitle = "Points labeled by patient count") + guides(col = "none", size = "none") + scale_y_continuous(breaks = 1:10) + scale_size(breaks = 2*1:5, range = c(2, 10))
# patients plotdat <- dotdf %>% group_by(diagnosis_full) %>% arrange(desc(count)) %>% mutate(cum_patients = cumsum(dotsize)) #%>% # ungroup() %>% rbind(list(1, "HC", 9, 9)) # only for visual purposes! ggplot(plotdat) + geom_point(aes(count, cum_patients, group = diagnosis_full, col = diagnosis_full), size = 3) + geom_line(aes(count, cum_patients, group = diagnosis_full, col = diagnosis_full)) + scale_x_continuous(breaks = 0:10) + scale_y_continuous(breaks = 10*0:6) + labs(x = "Number of Visits", y = "Patient Count", col = "Diagnosis", title = "Patients left in study\nwhen filtered by number of visits") + theme_gray(base_size = 16) # data points plotdat <- dotdf %>% mutate(num_datapoints = count * dotsize) %>% group_by(diagnosis_full) %>% arrange(desc(count)) %>% mutate(cum_datapoints = cumsum(num_datapoints)) #%>% # ungroup() %>% rbind(list(1, "HC", 3, 0, 60)) # only for visual purposes! ggplot(plotdat) + geom_point(aes(count, cum_datapoints, group = diagnosis_full, col = diagnosis_full), size = 3) + geom_line(aes(count, cum_datapoints, group = diagnosis_full, col = diagnosis_full)) + scale_x_continuous(breaks = 0:10) + scale_y_continuous(breaks = seq(0, 350, by = 50)) + labs(x = "Number of Visits", y = "Patient Count", col = "Diagnosis", title = "Total data points in study\nwhen filtered by number of visits") + theme_gray(base_size = 16)
fdata <- ibd$f_data tmp <- fdata %>% filter(diagnosis_full == "HC", timepoint == 1) %>% select(patientnumber, bmi, calprotectin, sex) tmp$calprotectin <- as.numeric(tmp$calprotectin) tmp$sex <- as.factor(tmp$sex) summary(tmp[,-1]) # calprotectin = surrogate for inflamatory activity
dfdbl <- fdata %>% group_by(patientnumber, timepoint) %>% summarise(count = n()) %>% filter(count > 1) # fdata %>% filter(patientnumber == 1, timepoint == 3) choose_valid_sample <- function(patient, time) { samp_names <- subset(ibd$f_data, patientnumber == patient & timepoint == time)$sample_name dfreads <- ibd$e_data[, samp_names] csums <- colSums(dfreads) # valid_samp <- names(dfreads)[which.max(csums)] # return(list(valid_samp, csums, head(dfreads))) # tmp <- ibd$e_data[,names(csums)] # pnonmiss <- apply(tmp, 2, function(x) mean(x > 0)) # return(list(csums, pnonmiss)) return(csums) } read_sums <- lapply(1:nrow(dfdbl), function(i) choose_valid_sample(dfdbl[i,][[1]], dfdbl[i,][[2]])) # names(read_sums) <- paste0("patient ", dfdbl[[1]], ", time ", dfdbl[[2]]) sums_mat <- as.data.frame(do.call(rbind, read_sums)) names(sums_mat) <- c("Sample1", "Sample2") sums_mat <- cbind(as.matrix(dfdbl[,1:2]), sums_mat) sums_mat$Ratio <- sums_mat$Sample1 / sums_mat$Sample2 sums_mat$Ratio[sums_mat$Ratio < 1] <- 1/sums_mat$Ratio[sums_mat$Ratio < 1] sums_mat$Ratio <- round(sums_mat$Ratio) sums_mat %>% arrange(Ratio)
Demographics of these odd patients
weird_patients <- sums_mat$patientnumber[sums_mat$Ratio < 10] ibd$f_data %>% filter(patientnumber %in% weird_patients, !duplicated(patientnumber)) %>% group_by(diagnosis_full, sex) %>% summarise(n = n())
# newf <- fdata %>% filter(diagnosis_full %in% c("HC", "CD", "UC"), timepoint == 5) # patients <- unique(newf$patientnumber) # ibd2 <- ibd # # dim(ibd2$f_data) # filt <- fdata_filter(ibd2, cname = "patientnumber", level_keep = patients) # ibd2 <- applyFilt(filt, ibd2) # ibd2$f_data %>% group_by(diagnosis_full) %>% summarise(n = n()) filt <- fdata_filter(ibd, cname = "timepoint", level_keep = 1:5) ibd2 <- applyFilt(filt, ibd) filt <- fdata_filter(ibd2, cname = "diagnosis_full", level_keep = c("HC", "CD", "UC")) ibd2 <- applyFilt(filt, ibd2) fd <- ibd2$f_data
(dnosis <- table(fd$diagnosis_full))
table(fd$timepoint)
tmp <- fd %>% filter(!duplicated(patientnumber)) table(tmp$diagnosis_full, tmp$sex)
library(vegan) library(ape) # D <- vegdist(t(ibd2$e_data[,-1])) # saveRDS(D, "data-raw/dist_mat_ibd.rds") D <- readRDS("dist_mat_ibd.rds") res <- pcoa(D) rownames(res$vectors) <- NULL biplot(res)
nmale <- 3 nfemale <- 6 newfd <- fd %>% filter(timepoint == 5) dfCD <- newfd %>% filter(diagnosis_full == "CD", !duplicated(patientnumber)) %>% select(patientnumber, sex) CDmale <- sample(dfCD$patientnumber[dfCD$sex == "male"], nmale) CDfemale <- sample(dfCD$patientnumber[dfCD$sex == "female"], nfemale) dfUC <- newfd %>% filter(diagnosis_full == "UC", !duplicated(patientnumber)) %>% select(patientnumber, sex) UCmale <- sample(dfUC$patientnumber[dfUC$sex == "male"], nmale) UCfemale <- sample(dfUC$patientnumber[dfUC$sex == "female"], nfemale) dfHC <- newfd %>% filter(diagnosis_full == "HC", !duplicated(patientnumber)) %>% select(patientnumber, sex) HCpatients <- dfHC$patientnumber new_patients <- c(CDmale, CDfemale, UCmale, UCfemale, HCpatients)
ffilt <- fd %>% filter(patientnumber %in% new_patients) ffilt %>% filter(!duplicated(patientnumber)) %>% group_by(diagnosis_full, sex) %>% summarise(n = n())
filt <- fdata_filter(ibd2, cname = "patientnumber", level_keep = new_patients) ibd_down <- applyFilt(filt, ibd2)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.