## inst/paper_eda/eda.R

## SKG
## Jan. 22, 2020
## some EDA (non-map kind)


devtools::load_all()
library(ggplot2)
library(dplyr)
library(tidyr)
library(viridis)
data(clusts_tb)


## Long-format copy of the cluster table used for the HIV / smear bar chart.
## The smear counts are renamed so that every count column follows the
## n_<var>_<type> pattern, rows are sorted (this row order is reused later to
## order clusters on the plot axis), and the count columns are stacked into
## `var` / `type` / `value`.
clusts_smear <- clusts_tb %>%
  rename(
    n_smear_pos = n_pos,
    n_smear_neg = n_neg,
    n_smear_unk = n_unk
  ) %>%
  arrange(size, n_smear_neg, n_hiv_pos, n_hiv_neg) %>%
  pivot_longer(
    cols = n_smear_pos:n_hiv_unk,
    names_to = c("var", "type"),
    names_pattern = "n_?(.*)_(.*)",
    values_to = "value"
  )



## Stacked horizontal bars of cluster composition for clusters with more than
## one case, one panel per variable (HIV / smear).  Clusters appear on the
## axis in the row order of clusts_smear.
ggplot(
  data = clusts_smear %>% filter(size > 1),
  aes(
    x = factor(PCR.Cluster, levels = unique(clusts_smear$PCR.Cluster)),
    y = value
  )
) +
  geom_col(aes(fill = type)) +
  coord_flip() +
  theme_bw(base_size = 12) +
  ## Tie each colour and legend label to its status code explicitly rather
  ## than relying on the alphabetical order of the values present in `type`.
  scale_fill_manual(
    breaks = c("neg", "pos", "unk"),
    values = c(neg = "red", pos = "blue", unk = "gray30"),
    labels = c("-", "+", "Unknown")
  ) +
  facet_wrap(~var, labeller = as_labeller(c("hiv" = "HIV", "smear" = "Smear"))) +
  labs(
    title = "Cluster size and HIV/Smear status",
    subtitle = "Cluster Size > 1",
    x = "Cluster ID",
    y = "Cluster Size",
    fill = "Value"
  ) +
  ## One bar per cluster, so the individual cluster IDs are not printed.
  theme(
    axis.text.y = element_blank(),
    legend.position = "bottom"
  )

## No `plot` argument: ggsave() writes the most recently created ggplot.
ggsave("hiv-smear.pdf", width = 7)


## Size and range
## Scatter plot of the time between first and last detection (in years)
## against cluster size, for clusters with more than one case.
## NOTE(review): dividing by 365 assumes as.numeric(inf_range) is in days --
## confirm the units of inf_range.
ggplot(
  data = clusts_tb %>% filter(size > 1),
  aes(x = size, y = as.numeric(inf_range) / 365)
) +
  geom_point(size = 2) +
  theme_bw(base_size = 12) +
  labs(
    x = "Cluster Size",
    y = "Time between first and last detection (years)",
    title = "Infection Duration vs. Cluster Size",
    subtitle = "With a Loess Smoother"
  ) +
  ## geom_smooth()'s default (method = "auto") only uses loess for small data
  ## sets; request it explicitly so the subtitle is always accurate.
  geom_smooth(method = "loess", formula = y ~ x)

## No `plot` argument: ggsave() writes the most recently created ggplot.
ggsave("duration-size.pdf")



## Collapse HIV and smear status to three levels: "Positive", "Negative" and
## "Unknown".  Anything that is not exactly "Positive" or "Negative" --
## including NA -- becomes "Unknown".  %in% is used instead of == because
## NA == "Positive" evaluates to NA, which ifelse() would pass through as NA
## rather than mapping it to "Unknown".
tb_clean$hiv <- ifelse(
  tb_clean$hivstatus %in% c("Positive", "Negative"),
  as.character(tb_clean$hivstatus),
  "Unknown"
)
tb_clean$smear <- ifelse(
  tb_clean$spsmear %in% c("Positive", "Negative"),
  as.character(tb_clean$spsmear),
  "Unknown"
)
## Per-cluster summary: number of cases, and the smear status of the case
## with the earliest and with the latest INIT_REGIMEN_START_DATE.
## Rows with an empty PCR.Cluster are dropped first.
clusts_first <- tb_clean %>%
  filter(PCR.Cluster != "") %>%
  group_by(PCR.Cluster) %>%
  summarize(
    size = dplyr::n(),
    first_smear = smear[order(INIT_REGIMEN_START_DATE)[1]],
    last_smear = smear[order(INIT_REGIMEN_START_DATE, decreasing = TRUE)[1]]
  )

## Cross-tabulate the first / last smear status against cluster size and
## convert each table to a data frame (columns Var1, Var2, Freq).
tab <- table(clusts_first[["first_smear"]], clusts_first[["size"]])
tab2 <- table(clusts_first[["last_smear"]], clusts_first[["size"]])

df <- data.frame(tab)
df2 <- data.frame(tab2)

## Number of clusters of each size, split by the smear status of the last case.
ggplot(df2, aes(x = Var2, y = Freq, fill = Var1)) +
  geom_col()


## Chi-squared test of association between HIV and smear status, restricted
## to rows with a non-empty PCR.Cluster.
df3 <- tb_clean %>% filter(PCR.Cluster != "")
tab3 <- table(df3$hiv, df3$smear, dnn = c("HIV", "Smear"))
## simulate.p.value = TRUE gives a Monte Carlo p-value; fix the RNG seed so
## the reported value is the same on every run.
set.seed(2020)
chisq.test(tab3, simulate.p.value = TRUE)

## Mosaic plot of smear by HIV status drawn with ggmosaic.
## NOTE(review): this uses every row of tb_clean, whereas tab3 (plotted
## below) was built from df3, i.e. only rows with a non-empty PCR.Cluster --
## confirm that the difference is intended.
library(ggmosaic)

ggplot(tb_clean) +
  geom_mosaic(mapping = aes(x = product(smear, hiv), fill = smear))

## Mosaic and association plots of the HIV x smear table, with shading enabled.
library(vcd)
vcd::mosaic(tab3, shade = TRUE, legend = TRUE)
vcd::assoc(tab3, shade = TRUE)
## skgallagher/TBornotTB documentation built on April 21, 2020, 1:19 p.m.