# data-raw/DATASET.R

## code to prepare a fake dataset for turnr package
# this is meant to mimic raw input data for the package

library(tidyverse)

set.seed(123)


# TargetNames -------------------------------------------------------------


# Every target name a respiratory-panel run can report, internal controls
# included. The order of the elements matters: later sample() calls draw from
# these vectors, so reordering would change the seeded output.
all_paths <- c(
    "Human Metapneumovirus",
    "PCR2 Control",
    "RNA Process Control",
    "Coronavirus NL63",
    "Influenza A H1-2009",
    "PCR1 Control",
    "Human Rhinovirus/Enterovirus",
    "Bocavirus",
    "Influenza A H3",
    "Coronavirus 229E",
    "Respiratory Syncytial Virus",
    "Coronavirus OC43",
    "Adenovirus",
    "Coronavirus HKU1",
    "Mycoplasma pneumoniae",
    "Parainfluenza Virus 1",
    "Parainfluenza Virus 4",
    "Parainfluenza Virus 3",
    "Influenza B",
    "Bordetella pertussis",
    "Chlamydophila pneumoniae",
    "Influenza A H1",
    "Parainfluenza Virus 2",
    "Influenza A (no subtype detected)",
    "Human RNA Control",
    "Coronavirus OC43 (RP)",
    "Bordetella parapertussis (IS1001)",
    "Bordetella pertussis (ptxP)",
    "Chlamydia pneumoniae"
)

# names containing "Control"/"control" are the control assays ...
controls <- str_subset(all_paths, "[Cc]ontrol")
# ... and everything else is treated as a respiratory pathogen
rp_paths <- all_paths[!(all_paths %in% controls)]

neg_rate <- 0.4 # proportion negatives

# stand-in pathogen names used for the gastro-intestinal panel
GI_paths <- c(
    "Aeromonas",
    "Astrovirus",
    "C. cayetanensis",
    "Campylobacter",
    "EPEC",
    "ETEC",
    "G. lamblia",
    "Norovirus",
    "Salmonella",
    "E. coli O157",
    "Adenovirus F",
    "C. difficile",
    "Cryptosporidium",
    "STEC",
    "Vibrio",
    "EAEC",
    "Sapovirus",
    "Shigella/EIEC",
    "Rotavirus A",
    "V. cholerae",
    "E. histolytica",
    "P. shigelloides",
    "Y. enterocolitica"
)

# site info ---------------------------------------------------------------



# Random 10-character identifiers for 7 sites and 56 instruments. The two
# calls are kept in their original order so the seeded draws are unchanged.
SiteIDs <- stringi::stri_rand_strings(7, length = 10)
Serials <- stringi::stri_rand_strings(56, length = 10)

# daily sequence covering the simulated period (both end dates included)
dates <- seq(from = lubridate::ymd("2017-01-01"),
             to = lubridate::ymd("2019-01-01"),
             by = 1)

# index 1 = respiratory panel, index 2 = gastro-intestinal panel
panels <- c("Respiratory_Panel", "Gastro_Intestinal")


# fake site info
# one row per region with its (made up) zip code
info1 <- tibble(Region = c("UT", "NY", "CA"),
                ZipCode = c(12345, 99999, 11111),
                Country = "United States of America")

# one row per site: each SiteID is assigned a region, then picks up the
# region-level columns from info1
info2 <- tibble(SiteID = SiteIDs,
                Region =  c("UT", "UT", "UT", "NY","NY", "NY", "CA")) %>%
    left_join(info1, by = "Region")

# Assign instruments to sites by cycling through the sites. rep_len() always
# returns exactly one site per serial; the previous
# rep(SiteIDs, length(Serials)/length(SiteIDs)) only worked when the number of
# serials was an exact multiple of the number of sites (a fractional `times`
# is truncated, after which the names<- assignment below fails).
lookup_SiteID <- rep_len(SiteIDs, length(Serials))
names(lookup_SiteID) <- Serials

versions <- c("FA1.5", "FA2.0", "Torch")

# one row per instrument: serial number, owning site, a randomly drawn
# instrument version, plus the site-level columns from info2
info3 <- tibble(InstrumentSerialNumber = Serials,
                SiteID = lookup_SiteID[Serials],
                InstrumentVersion = sample(versions, length(Serials), replace = TRUE)) %>%
    left_join(info2, by = "SiteID")


# generating data based on fake TUR -----------------------------------

# day index: one integer per element of `dates`
# (seq_along() is also safe if `dates` were ever empty, unlike 1:length())
x <- seq_along(dates)

# expected TUR: a sine wave with amplitude 7 oscillating around 20
y <- 7*sin(x/50) + 20
# plot(y~x, type = "l")

# generating values of TUR: one Poisson draw per day, with mean y for that day
TUR <- rpois(length(dates), y)
# plot(TUR ~ dates, type = "l")

# one row per day with its simulated TUR value
df1 <- tibble(date = dates,
              TUR = TUR)

# Expand the daily table df1 into a run-level table: every date gets a set of
# random 15-character RunDataIDs (intended to be TUR of them per date), and each
# run is assigned an instrument and a panel.
# NOTE(review): after nest(TUR = TUR) each element handed to
# stri_rand_strings() is a one-row tibble rather than a bare integer;
# presumably it is coerced to the count -- confirm.
# The order of the random calls below determines the seeded output, so the
# statements should not be reordered.
df2 <- df1 %>%
    group_by(date) %>%
    nest(TUR = TUR) %>%
    mutate(RunDataID = map(TUR, stringi::stri_rand_strings, length = 15)) %>%
    # spread the per-date list of IDs back out to one row per RunDataID
    unnest(cols = c("TUR", "RunDataID")) %>%
    ungroup() %>%
    # each run is assigned to a randomly chosen instrument
    mutate(InstrumentSerialNumber = sample(Serials, size = nrow(.),
                                           replace = TRUE),
           # 70% of pouchtitles should be RP
           # (what_panel is 1 with probability 0.3, which selects panels[2])
           what_panel = rbinom(n = nrow(.), 1, 0.3),
           PouchTitle = panels[(what_panel + 1)]) %>%
    select(-what_panel)

# sanity check: abort the script if two runs were given the same random ID
stopifnot(sum(duplicated(df2$RunDataID)) == 0) # should all be unique

# Build the target-level table: one row per (run, reported target).
# Every run reports two controls; non-negative runs additionally report one or
# (rarely) more pathogens.
df3 <- df2 %>%
    group_by(date, RunDataID) %>%
    nest() %>%
    mutate(TargetName = map(data, function(run_info) {
        # `run_info` is never used; it only lets map() call this once per run
        targets <- sample(controls, 2) # two control results for the run
        is_negative <- runif(1) < neg_rate
        if (!is_negative) {
            # aiming for roughly a 5% co-detection rate: always one pathogen,
            # occasionally more
            n_org <- 1 + rbinom(1, 20, 0.05/20)
            targets <- c(targets, sample(rp_paths, n_org))
        }
        targets
    })) %>%
    unnest(cols = c("data", "TargetName")) %>%
    ungroup() %>%
    # runs on the non-RP panel keep their controls, but every pathogen is
    # replaced with a randomly drawn fake GI pathogen
    mutate(TargetName = ifelse(PouchTitle == "Respiratory_Panel" |
                                   TargetName %in% controls,
                               TargetName,
                               sample(GI_paths, nrow(.), replace = TRUE))) %>%
    # label each row and start with a single assay named after its target
    mutate(ResultType = ifelse(!(TargetName %in% controls),
                               "organism",
                               "control"),
           AssayName = TargetName)

# adding duplicate rows (to help mimic multiple Assays)
# organism rows only: controls never receive a second assay
df_org <- df3 %>%
    filter(ResultType == "organism")

# Copy a random third of the organism rows (without replacement) and relabel
# the copies as a second assay. sample.int() with an integer-divided size
# replaces sample(1:nrow(df_org), size = nrow(df_org)/3): it selects the same
# rows, but stays well defined for very small tables and no longer relies on a
# fractional size being silently truncated.
extra_rows <- df_org[sample.int(nrow(df_org), size = nrow(df_org) %/% 3), ] %>%
    mutate(AssayName = paste(AssayName, "assay 2")) # fake second assay

# Final raw-style table: original rows plus the extra assay rows, joined to the
# instrument/site info, with the remaining raw-data columns filled in.
df4 <- bind_rows(df3, extra_rows) %>%
    ungroup() %>%
    left_join(info3, by = "InstrumentSerialNumber") %>%
    select(-TUR) %>%
    # every run gets the same fixed time of day appended to its date
    mutate(StartTime = paste(as.character(date), "07:26:49.000"),
           FlaggedAsValidation = 0,
           TargetShortName = NA,
           TargetResult = ifelse(ResultType == "control",
                                 "Pass", "Positive"),
           # in actual data set there are some negative assay results
           AssayResult = "Positive") %>%
    select(-date)


# turnr::initial_check(df4)
rp_raw <- df4

# save for use in package
usethis::use_data(rp_raw, overwrite = TRUE)


# creating processed files ------------------------------------------------

# creating downstream files, so that examples don't have to run
# all the processing functions

# Both processed datasets are derived from the `rp_raw` object created above.
# Previously TUR_dat was built from turnr::rp_raw, i.e. the copy shipped with
# the currently installed/loaded package, which can be out of date (or missing
# on a first run) and therefore inconsistent with the data saved by this script.

# daily TUR
TUR_dat <- turnr::pre_process(rp_raw) %>%
    turnr::calc_active_instruments()

usethis::use_data(TUR_dat, overwrite = TRUE)

# count by pathogen/site
path_dat <- turnr::pre_process(rp_raw) %>%
    turnr::co_detection() %>%
    turnr::calc_count_by_site_inst()

usethis::use_data(path_dat, overwrite = TRUE)
# MartinHoldrege/turnr documentation built on May 16, 2020, 10:39 a.m.