dumb_code/simulate_data_no_dplyr.R

#' Simulate data
#' @export
#' @import dplyr
#' @import data.table

simulate_ep_data_no_dplyr <- function(
  seed = 1,
  n_patients = 350000,
  age_min = 45, 
  age_max = 75,
  n_exclusion_year = 7500,
  admission_year_min = 2007, 
  admission_year_max = 2013
) {
  
  set.seed(seed)
  
  # 350 000 premiers cas de I26
  
  
  simulated_data <- data.frame(
    id_patient = seq_len(n_patients), 
    diagnoses = "PE", 
    procedures = as.character(''),
    age = round(runif(n_patients, min = age_min, max = age_max)),
    stringsAsFactors = FALSE
  )
  
  # tirage au sort de 33% de séjours I26 dupliqués (465000 séjours au total)
  patients_to_duplicate <- sample(
    x = seq_len(n_patients),
    size = round(n_patients/3),
    replace = TRUE
  )
  
  simulated_data <- 
    simulated_data[c(seq_len(n_patients), patients_to_duplicate),]
  
  n_year <- admission_year_max - admission_year_min + 1
  n_exclusion <- n_exclusion_year * n_year
  # ajout de 50 000 critères d'exclusion annuels (probabilité de 7500/an)
  ## bind_rows(.,sample_n(mutate(., diagnoses = "EXCLUSION"),size = 50000, replace = TRUE)) %>%
  patients_excluded <- sample(
    x = seq_len(nrow(simulated_data)),
    size = n_exclusion,
    replace = TRUE
  )
  
  df_excluded <- simulated_data[patients_excluded, ]
  df_excluded$diagnoses <- "EXCLUSION"
  
  simulated_data <- rbind(simulated_data, df_excluded)
  # Add date of admission
    # Date as.Date("2013-12-12") - as.Date("2007-01-01") = 2537 jours
  start_date <- as.Date(paste0(admission_year_min, "-01-01"))
  n_days <- as.integer(as.Date(paste0(admission_year_max, "-12-12")) - 
                      start_date)
  simulated_data$admission <- round(runif(nrow(simulated_data), min = 1, max = n_days))
  
  
    # Baseline risk of THR 1200/year = 8000 (+ 5/7)
  ##bind_rows(.,
  ##          sample_n(., size = 11760, replace = FALSE) %>%
   ##           mutate(admission = round(runif(dim(.)[1], min = 1, max = 2537))) %>%
  ##            mutate(diagnoses = 'NODIAG', procedures = "THR"))
  
  selected_patients_THR <- sample(
    x = nrow(simulated_data),
    size = 11760,
    replace = FALSE
  )
  
  select_df <- simulated_data[selected_patients_THR, ]
  select_df$admission <- round(runif(nrow(select_df), min = 1, max = n_days))
  select_df$diagnoses <- "NODIAG"
  select_df$procedures <- "THR"
  
  simulated_data <- rbind(simulated_data, select_df)
  
   # Risk of THR 3000 (+5/7) within 6 months before PE   
##  bind_rows(.,
##            filter(., diagnoses %in% 'PE') %>% 
##              sample_n(size = 4200, replace = FALSE) %>%
##              mutate(admission = admission - round(rexp(dim(.)[1])*40)) %>%
##              mutate(diagnoses = 'NODIAG', procedures = "THR")
##  )
  sel_diag_PE <- simulated_data[simulated_data$diagnoses == "PE",]
  
  sample_PE <- sample(x = seq_len(nrow(sel_diag_PE)), size = 4200, replace = FALSE)
  df_PE <- sel_diag_PE[sample_PE, ]
  df_PE$admission <- df_PE$admission - round(rexp(nrow(df_PE))*40)
  df_PE$diagnoses <- "NODIAG"
  df_PE$procedures <- "THR"
  
  simulated_data <- rbind(simulated_data, df_PE)
  
  ##filter(admission > 0) %>%
  simulated_data <- simulated_data[simulated_data$admission > 0, ]

    ##mutate(year = as.Date(admission, origin = "2007-01-01") %>% format("%Y") %>% as.numeric) %>%
  simulated_data$year <- as.integer(format(
    x = as.Date(simulated_data$admission, origin = start_date),
    "%Y"
  ))
    
  ##  mutate(length_of_stay = 1) %>%
  ##  mutate(id_stay = 1:dim(.)[1]) %>%
  simulated_data$length_of_stay <- 1
  simulated_data$id_stay <- seq_len(nrow(simulated_data))
  
  simulated_data$diagnoses <- as.factor(simulated_data$diagnoses)
  simulated_data$procedures <- as.factor(simulated_data$procedures)
  
  return(as.data.table(simulated_data))
}
jomuller/ITCARES documentation built on May 19, 2019, 7:26 p.m.