R/simulate_data.R

Defines functions simulate_ep_data

Documented in simulate_ep_data

#' Simulate data
#' @export
#' @import dplyr
#' @import data.table

simulate_ep_data <- function(seed = 1) {

    set.seed(seed)
    # 350 000 premiers cas de I26
    simulated_data <- data.frame(id_patient = 1:350000, diagnoses = "PE", procedures = as.character('')) %>%
      mutate(age = round(runif(dim(.)[1], min = 45, max = 75))) %>%
      # tirage au sort de 33% de séjours I26 dupliqués (465000 séjours au total)
      bind_rows(.,sample_frac(.,size = .33, replace = TRUE)) %>%
      # group_by(id_patient) %>% summarise(n=n()) %>% .$n %>% table() %>%
      # ajout de 50 000 critères d'exclusion annuels (probabilité de 7500/an)
      bind_rows(.,sample_n(mutate(., diagnoses = "EXCLUSION"),size = 50000, replace = TRUE)) %>%
      # Date as.Date("2013-12-12") - as.Date("2007-01-01") = 2537 jours
      mutate(admission = round(runif(dim(.)[1], min = 1, max = 2537))) %>%
      # Baseline risk of THR 1200/year = 8000 (+ 5/7)
      bind_rows(.,
                sample_n(., size = 11760, replace = FALSE) %>%
                  mutate(admission = round(runif(dim(.)[1], min = 1, max = 2537))) %>%
                  mutate(diagnoses = 'NODIAG', procedures = "THR")
      ) %>%
      # Risk of THR 3000 (+5/7) within 6 months before PE   
      bind_rows(.,
                filter(., diagnoses %in% 'PE') %>% 
                  sample_n(size = 4200, replace = FALSE) %>%
                  mutate(admission = admission - round(rexp(dim(.)[1])*40)) %>%
                  mutate(diagnoses = 'NODIAG', procedures = "THR")
      ) %>% 
      filter(admission > 0) %>%
      mutate(year = as.Date(admission, origin = "2007-01-01") %>% format("%Y") %>% as.numeric) %>%
      mutate(length_of_stay = 1) %>%
      mutate(id_stay = 1:dim(.)[1]) %>%
      as.data.table()

  return(simulated_data)
}
jomuller/ITCARES documentation built on May 19, 2019, 7:26 p.m.