knitr::opts_chunk$set(echo = FALSE)
library(pmartRmems)
library(dplyr)
library(ggplot2)
library(lubridate)
data("ibd")

EDA

Time Component

time <- ibd$f_data %>% 
  select(patientnumber, collection_timestamp, timepoint, diagnosis_full) %>% 
  mutate(rowID = 1:nrow(.)) %>%
  select(rowID, everything())
time$collection_timestamp <- mdy(time$collection_timestamp)
# timedf <- tidyr::separate(time, collection_timestamp, sep = "/", into = c("month","day","year"))
timediff <- time %>% group_by(patientnumber) %>% summarise(timespan = (max(collection_timestamp) - min(collection_timestamp))[[1]])
timedf <- merge(time, timediff, by = "patientnumber") %>% arrange(timespan)
ggplot(timedf) + geom_bar(aes(as.factor(timepoint))) + 
  labs(x = "Number of Visits", title = "Number of Visits Completed by Each Patient")
plotdat <- timedf
plotdat$newID <- factor(plotdat$patientnumber, levels = unique(plotdat$patientnumber))

ggplot(plotdat) + 
  geom_line(aes(collection_timestamp, newID, group = newID)) + 
  geom_point(aes(collection_timestamp, newID, group = newID)) +
  coord_cartesian(xlim = c(mdy(01012009), mdy(01012012))) + 
  theme(axis.text.y = element_blank()) +
  labs(title = "Study Duration by Subject", subtitle = "Ordered by Length of Study")
plotdat <- timedf %>% arrange(collection_timestamp)
plotdat$newID <- factor(plotdat$patientnumber, levels = unique(plotdat$patientnumber))

ggplot(plotdat) + 
  geom_line(aes(collection_timestamp, newID, group = newID)) + 
  geom_point(aes(collection_timestamp, newID, group = newID)) +
  coord_cartesian(xlim = c(mdy(01012009), mdy(01012012))) + 
  theme(axis.text.y = element_blank()) +
  labs(title = "Study Duration by Subject", subtitle = "Ordered by Beginning Date of Study")
# time between points vs number of time points
newdf <- timedf %>% arrange(patientnumber, collection_timestamp)
not_uniq <- unlist(lapply(unique(newdf$patientnumber), function(x) {
  duplicated(newdf$collection_timestamp[newdf$patientnumber == x])
  }))
time_uniq <- newdf[!not_uniq,]

timediff_by_patient <- function(timedata, patient) {
  dat <- timedata %>% filter(patientnumber == patient) %>% arrange(collection_timestamp)
  ts <- dat$collection_timestamp
  dat$lag <- as.numeric(ts - dplyr::lag(ts))
  dat$days_from_start <- as.numeric(ts - min(ts))

  return(dat)
}

templist <- lapply(unique(time_uniq$patientnumber), timediff_by_patient, timedata = time_uniq)
lagdata <- do.call(rbind, templist)
sumdata <- lagdata %>% group_by(patientnumber) %>% summarise(ntimes = n())
new_timedat <- merge(lagdata, sumdata, by = "patientnumber")
no_na_lag <- new_timedat %>% filter(!is.na(lag))
ggplot(no_na_lag) + 
  geom_point(aes(ntimes, lag, col = patientnumber)) + 
  coord_cartesian(ylim = c(0, 500))
plotdat <- new_timedat %>% arrange(timespan)
plotdat$newID <- factor(plotdat$patientnumber, levels = unique(plotdat$patientnumber))
ggplot(plotdat) + 
  geom_line(aes(days_from_start, newID, group = newID)) + 
  geom_point(aes(days_from_start, newID, group = newID)) +
  coord_cartesian(xlim = c(0, 1000)) +
  theme(axis.text.y = element_blank()) +
  labs(title = "Study Duration by Subject", subtitle = "Scaled from First Study")
plotdat <- new_timedat %>% arrange(timespan)
plotdat$newID <- factor(plotdat$patientnumber, levels = unique(plotdat$patientnumber))
ggplot(plotdat) + 
  geom_line(aes(days_from_start, newID, group = newID, col = diagnosis_full)) + 
  geom_point(aes(days_from_start, newID, group = newID, col = diagnosis_full)) +
  coord_cartesian(xlim = c(0, 1000)) +
  theme(axis.text.y = element_blank()) +
  labs(title = "Study Duration by Subject", subtitle = "Colored by Diagnosis")
plotdat <- no_na_lag %>% filter(timepoint < 10) %>% mutate(timepoint = as.factor(timepoint))
ggplot(plotdat) + geom_density(aes(lag, group = timepoint, fill = timepoint), alpha = 0.5) +
  coord_cartesian(xlim = c(0, 200)) +
  labs(title = "Time Between Points", x = "Days")

Data points per group

tempdat <- new_timedat %>% 
  filter(diagnosis_full %in% c("CD", "HC", "UC")) %>% 
  group_by(patientnumber, diagnosis_full) %>% 
  summarise(count = n())
dotdf <- tempdat %>% group_by(count, diagnosis_full) %>% summarise(dotsize = n())
plotdat <- merge(tempdat, dotdf) %>% mutate(num_datapoints = count * dotsize)

ggplot(plotdat) + 
  geom_point(aes(diagnosis_full, count, col = diagnosis_full, size = dotsize)) +
  annotate("text", x = plotdat$diagnosis_full, y = plotdat$count, label = plotdat$dotsize) +
  labs(x = "Diagnosis", y = "Number of Visits", title = "Patient Visits by Diagnosis Type", 
       subtitle = "Points labeled by patient count") +
  guides(col = "none", size = "none") +
  scale_y_continuous(breaks = 1:10) + 
  scale_size(breaks = 2*1:5, range = c(2, 10))
# patients
plotdat <- dotdf %>% group_by(diagnosis_full) %>% arrange(desc(count)) %>% 
  mutate(cum_patients = cumsum(dotsize)) #%>% 
  # ungroup() %>% rbind(list(1, "HC", 9, 9)) # only for visual purposes!

ggplot(plotdat) + geom_point(aes(count, cum_patients, 
                                 group = diagnosis_full, col = diagnosis_full), size = 3) +
  geom_line(aes(count, cum_patients, group = diagnosis_full, col = diagnosis_full)) +
  scale_x_continuous(breaks = 0:10) + 
  scale_y_continuous(breaks = 10*0:6) + 
  labs(x = "Number of Visits", y = "Patient Count", col = "Diagnosis", 
       title = "Patients left in study\nwhen filtered by number of visits") +
  theme_gray(base_size = 16)

# data points
plotdat <- dotdf %>% 
  mutate(num_datapoints = count * dotsize) %>%
  group_by(diagnosis_full) %>% 
  arrange(desc(count)) %>% 
  mutate(cum_datapoints = cumsum(num_datapoints)) #%>%
  # ungroup() %>% rbind(list(1, "HC", 3, 0, 60)) # only for visual purposes!

ggplot(plotdat) + geom_point(aes(count, cum_datapoints,
                                 group = diagnosis_full, col = diagnosis_full), size = 3) +
  geom_line(aes(count, cum_datapoints, group = diagnosis_full, col = diagnosis_full)) +
  scale_x_continuous(breaks = 0:10) +
  scale_y_continuous(breaks = seq(0, 350, by = 50)) +
  labs(x = "Number of Visits", y = "Patient Count", col = "Diagnosis",
       title = "Total data points in study\nwhen filtered by number of visits") +
  theme_gray(base_size = 16)

GLM

Metadata for Healthy Patients

fdata <- ibd$f_data
tmp <- fdata %>% filter(diagnosis_full == "HC", timepoint == 1) %>% 
  select(patientnumber, bmi, calprotectin, sex)
tmp$calprotectin <- as.numeric(tmp$calprotectin)
tmp$sex <- as.factor(tmp$sex)
summary(tmp[,-1])
# calprotectin = surrogate for inflamatory activity

Patients with more than one sample at a time point

dfdbl <- fdata %>% group_by(patientnumber, timepoint) %>% summarise(count =  n()) %>% 
  filter(count > 1)
# fdata %>% filter(patientnumber == 1, timepoint == 3)

choose_valid_sample <- function(patient, time) {
  samp_names <- subset(ibd$f_data, patientnumber == patient & timepoint == time)$sample_name
  dfreads <- ibd$e_data[, samp_names]
  csums <- colSums(dfreads)
  # valid_samp <- names(dfreads)[which.max(csums)]
  # return(list(valid_samp, csums, head(dfreads)))

  # tmp <- ibd$e_data[,names(csums)]
  # pnonmiss <- apply(tmp, 2, function(x) mean(x > 0))
  # return(list(csums, pnonmiss))
  return(csums)
}
read_sums <- lapply(1:nrow(dfdbl), function(i) choose_valid_sample(dfdbl[i,][[1]], dfdbl[i,][[2]]))
# names(read_sums) <- paste0("patient ", dfdbl[[1]], ", time ", dfdbl[[2]])
sums_mat <- as.data.frame(do.call(rbind, read_sums))
names(sums_mat) <- c("Sample1", "Sample2")
sums_mat <- cbind(as.matrix(dfdbl[,1:2]), sums_mat)
sums_mat$Ratio <- sums_mat$Sample1 / sums_mat$Sample2
sums_mat$Ratio[sums_mat$Ratio < 1] <- 1/sums_mat$Ratio[sums_mat$Ratio < 1]
sums_mat$Ratio <- round(sums_mat$Ratio)
sums_mat %>% arrange(Ratio)

Demographics of these odd patients

weird_patients <- sums_mat$patientnumber[sums_mat$Ratio < 10]
ibd$f_data %>% filter(patientnumber %in% weird_patients, !duplicated(patientnumber)) %>% 
  group_by(diagnosis_full, sex) %>% summarise(n = n())

Filter out patients that came less than 5 times

# newf <- fdata %>% filter(diagnosis_full %in% c("HC", "CD", "UC"), timepoint == 5)
# patients <- unique(newf$patientnumber)
# ibd2 <- ibd
# # dim(ibd2$f_data)
# filt <- fdata_filter(ibd2, cname = "patientnumber", level_keep = patients)
# ibd2 <- applyFilt(filt, ibd2)
# ibd2$f_data %>% group_by(diagnosis_full) %>% summarise(n = n())
filt <- fdata_filter(ibd, cname = "timepoint", level_keep = 1:5)
ibd2 <- applyFilt(filt, ibd)
filt <- fdata_filter(ibd2, cname = "diagnosis_full", level_keep = c("HC", "CD", "UC"))
ibd2 <- applyFilt(filt, ibd2)
fd <- ibd2$f_data

diagnosis

(dnosis <- table(fd$diagnosis_full))

timepoint

table(fd$timepoint)

sex of patients by diagnosis type

tmp <- fd %>% filter(!duplicated(patientnumber))
table(tmp$diagnosis_full, tmp$sex)

Look for outliers with PCoA

library(vegan)
library(ape)
# D <- vegdist(t(ibd2$e_data[,-1]))
# saveRDS(D, "data-raw/dist_mat_ibd.rds")
D <- readRDS("dist_mat_ibd.rds")
res <- pcoa(D)
rownames(res$vectors) <- NULL
biplot(res)

Sample 9 patients from CD and UC

nmale <- 3
nfemale <- 6

newfd <- fd %>% filter(timepoint == 5)

dfCD <- newfd %>% filter(diagnosis_full == "CD", !duplicated(patientnumber)) %>% 
  select(patientnumber, sex)
CDmale <- sample(dfCD$patientnumber[dfCD$sex == "male"], nmale)
CDfemale <- sample(dfCD$patientnumber[dfCD$sex == "female"], nfemale)

dfUC <- newfd %>% filter(diagnosis_full == "UC", !duplicated(patientnumber)) %>% 
  select(patientnumber, sex)
UCmale <- sample(dfUC$patientnumber[dfUC$sex == "male"], nmale)
UCfemale <- sample(dfUC$patientnumber[dfUC$sex == "female"], nfemale)

dfHC <- newfd %>% filter(diagnosis_full == "HC", !duplicated(patientnumber)) %>% 
  select(patientnumber, sex)
HCpatients <- dfHC$patientnumber

new_patients <- c(CDmale, CDfemale, UCmale, UCfemale, HCpatients)
Make sure gender is evenly distributed
ffilt <- fd %>% filter(patientnumber %in% new_patients)
ffilt %>% filter(!duplicated(patientnumber)) %>% group_by(diagnosis_full, sex) %>% summarise(n = n())
filt <- fdata_filter(ibd2, cname = "patientnumber", level_keep = new_patients)
ibd_down <- applyFilt(filt, ibd2)


pmartR/pmartRmems documentation built on May 29, 2019, 4:52 p.m.