Nothing
#' Selected controls matching a list of variables from a case
#'
#' Helper function for get_matches()
#' This function wil perform incidence density sampling or exact sampling up to a supplied number
#' of matched controls
#'
#'
#' @param matcher list of character strings defining the matching conditions
#' @param control_pool dataframe of potential controls for matching
#' @param n_controls The number of controls for each case. If replace == FALSE
#' this is a maximum value
#' @param id named vector of length 1 for the variable name and value of the case identifier
#' @param replace logical should sampling of matched controls be with replacement (incident density
#' sampling) or not.
#' @return A dataframe of matched controls or NULL if no controls could be found
match_case <- function(matcher, control_pool, n_controls, id, replace){
matched_controls <- filter_(control_pool, paste(names(id), "!=", id), .dots = matcher)
if(!nrow(matched_controls)) {
cat("No matches for id", id, ".")
return(NULL)
}
if(replace){
matched_controls <- matched_controls %>%
sample_n(size = n_controls, replace = TRUE) %>%
mutate_(matched_case = id)
} else {
n_ <- nrow(matched_controls)
if(0 < n_ & n_ < n_controls) cat(n_)
matched_controls <- matched_controls %>%
sample_n(size = min(n_, n_controls), replace = FALSE) %>%
mutate_(matched_case = id)
}
matched_controls
}
#' Find matched controls for a set of cases
#'
#' This function will provide a set of matched controls for a given set of cases.
#'
#' @export
#' @param cases dataframe of cases
#' @param control_pool dataframe of potential controls to be used for matching
#' @param n_controls number of controls to match to each case
#' @param match_vars character vector of variables in the dataframes to be used to perform the
#' matching
#' @param extra_vars character vector of other variables to be used in the matching to define other
#' conditions
#' @param extra_conditions a character vector of length 1 defining further restrictions on matching
#' @param cores number of cpu cores to be used by multicore (windows users should leave set to 1)
#' @param method The method of selection of controls (see details)
#' @param track logical should a dot be printed to std.out for each case?
#' @param tracker function to track progress of the function (See details)
#' @param diagnosis_date character the name of the variable in the cases and control_pool datasets
#' containing the date of diagnosis (or other event to base the IDM method on). If there is no
#' diagnosis date for a patient, this should be represented by NA
#' @details
#' Setting method to "exact" means that the matched controls are removed from the control pool
#' after each case has been matched. This makes this method not thread safe and so will only
#' run on a single core (and more slowly).
#' Setting method to "incidence_density" is thread safe as the same controls can be used for more than one case.
#' See Richardson (2004) Occup Environ Med 2004;61:e59 doi:10.1136/oem.2004.014472 for a description
#' of IDS matching. Also see the introduction vignette.
#' The tracker variable allows for different outputs to track the progress of the function.
#' This is currently set to ouput a dot for every case matched. A function can be added to the
#' argument For a more verbose tracking, e.g. to track number of cases,
#' set \code{tracker = function() paste0(case_num, ",")}
get_matches <- function(cases, control_pool, n_controls, match_vars, extra_vars,
extra_conditions = NULL, cores = 1,
track = TRUE, tracker = function(case_num) ".",
method = c("incidence_density", "exact"),
diagnosis_date = NULL){
method <- match.arg(method)
if(method == "incidence_density"){
assert_that(!is.null(diagnosis_date))
message("Running Incident Density Sampling on ", cores, " cpu cores.")
cases$pcase <- 1
control_pool$pcase <- 0
control_pool <- bind_rows(cases, control_pool)
bind_rows(mclapply(1:nrow(cases), function(case_num){
CASE <- cases[case_num,]
matcher <- c(lapply(match_vars, function(m) {
paste0(m, " == '", CASE[[m]], "'")
}), wrap_sql_query("is.na(#1) | #1 > '#2'", diagnosis_date, CASE[[diagnosis_date]]))
if(!is.null(extra_conditions)) {
matcher <- append(matcher, expand_string(extra_conditions))
}
id <- CASE[[.ehr$patient_id]]
names(id) <- .ehr$patient_id
if(track) cat(tracker(case_num))
match_case(matcher, control_pool, n_controls, id, replace = TRUE)
}, mc.cores = cores))
} else if (method == "exact"){
message("Running on a single core for Unique Matching.\n",
nrow(control_pool), " controls...")
for(case_num in 1:nrow(cases)){
if(track) cat(tracker(case_num))
CASE <- cases[case_num,]
matcher <- lapply(match_vars, function(m) {
paste0(m, " == '", CASE[[m]], "'")
})
if(!is.null(extra_conditions)) {
matcher <- append(matcher, expand_string(extra_conditions))
}
id <- CASE[[.ehr$patient_id]]
names(id) <- .ehr$patient_id
if(exists("matched_")){
matched_ <- bind_rows(matched_, match_case(matcher, control_pool,
n_controls, id, replace = FALSE))
} else {
matched_ <- match_case(matcher, control_pool,
n_controls, id, replace = FALSE)
}
control_ids_ <- unique(matched_[[names(id)]])
exclude_str_ <- expand_string("! .(names(id)) %in% .(control_ids_)")
control_pool <- control_pool %>% filter_(exclude_str_)
}
matched_
}
}
#' Function for performing matching of controls to cases using the consultation files to generate a
#' dummy index date for controls.
#'
#' Controls are matched on an arbitrary number of categrorical variables and on continuous
#' variables via the \code{extra_conditions} argument. Also the date at \code{index_var} is matched to the
#' eventdate in the consultation files, providing a dummy index date for controls of a consultaton
#' within +/- \code{index_diff_limit} days of the index date.
#'
#' Note that the consultaton files must be in flat-file format (i.e. not as part of the database,
#' but as text (or other filetype, e.g stata dta) files). Set the \code{import_fn} argument to
#' use different file formats (e.g. \code{foreign::read.dta} or \code{readstata13::read.dta13})
#'
#' The \code{extra_conditions} argument can add extra condtions to the matching criteria
#' on top of the matching vars for example you could add "year > 1990". You can wrap calls to
#' expressions in dotted brackets to automatically expand them. This is particularly useful
#' when you want to find the value for each individual case. Each case is denoted by \code{CASE}
#' e.g. "start_date < .(CASE$start_date)" will ensure the start date for controls is prior to
#' the start date for the matched case.
#'
#'
#'
#' @export
#'
#' @param cases A dataframe of cases to which to match controls
#' @param control_pool A dataframe of possible contols to match to cases
#' @param index_var character string of the name of the variable containing index dates
#' @param match_vars character vector detailing the common variables in \code{cases} and
#' \code{control_pool} to match on
#' @param extra_conditions character string detailing other matching constraints (see details)
#' @param index_diff_limit integer number of days before or after the case index date that
#' dummy index dates can be picked from the consultation files
#' @param consult_path path to directory containing consultation files
#' @param n_controls integer the number of controls to attempt to match to each case
#' @param cores integer the number of processor cores to be used in processing
#' @param import_fn function name stipulating the function used to read the consultation files
#' @param \dots extra arguments to be passed to import_fn
#'
#' @return a dataframe of matched controls
#'
match_on_index <- function(cases, control_pool, index_var, match_vars,
extra_conditions = "", index_diff_limit = 90, consult_path,
n_controls = 5, cores = 1, import_fn = read.delim, ...){
message("Finding ", nrow(cases), " cases...")
cons_files <- list.files(consult_path)
bind_rows(mclapply(unique(cases[[ .ehr$practice_id ]]), function(practice){
p_cases <- filter_(cases, expand_string(".(.ehr$practice_id) == .(practice)"))
p_controls <- filter_(control_pool, expand_string(".(.ehr$practice_id) == .(practice)"))
#cons_files[str_detect(cons_files, pattern = str_pad(practice, 3, pad = 0))]
consultations <- import_fn(file.path(consult_path,
cons_files[str_detect(cons_files,
str_pad(practice,
3, pad = 0))]), ...) %>%
filter_(expand_string("! .(.ehr$patient_id) %in% .(p_cases[[.ehr$patient_id]])"))
if(exists("matched_")) rm(matched_)
p_num <- nrow(p_cases)
cat("Start practice", practice, ".")
for(case_num in 1:p_num){
CASE <- p_cases[case_num,]
extra_conditions_ <- expand_string(extra_conditions)
matcher <- lapply(match_vars, function(m) {
paste0(m, " == '", CASE[[m]], "'")
})
if(nchar(extra_conditions_)){
matcher <- append(matcher, extra_conditions_, after = length(matcher))
}
id <- CASE[[ .ehr$patient_id]]
names(id) <- .ehr$patient_id
matched_controls <- filter_(p_controls, .dots = append(paste(names(id), "!=", id),
matcher))
matched_consultations <- filter_(consultations,
expand_string(".(.ehr$patient_id) %in%
.(matched_controls[[.ehr$patient_id]])")) %>%
mutate_(index_diff = expand_string("abs(as.integer( .(.ehr$eventdate) -
.(CASE[[index_var]]) ))")) %>%
filter(index_diff <= index_diff_limit) %>%
group_by_(expand_string(".(.ehr$patient_id)")) %>%
arrange(index_diff) %>%
distinct() %>%
transmute_(index_date = expand_string(".(.ehr$eventdate)"))
matched_controls <- inner_join(matched_controls, matched_consultations,
by = .ehr$patient_id)
n_ <- nrow(matched_controls)
if (n_ == 0){
cat(0)
#next
} else if (0 < n_ & n_ < n_controls){
cat(n_)
} else cat(".")
if(exists("matched_")){
matched_ <- bind_rows(matched_, matched_controls %>%
sample_n(size = min(n_, n_controls), replace = FALSE) %>%
mutate_(matched_case = id))
} else {
matched_ <- matched_controls %>%
sample_n(size = min(n_, n_controls), replace = FALSE) %>%
mutate_(matched_case = id)
}
control_ids_ <- unique(matched_[[names(id)]])
exclude_str_ <- expand_string("! .(names(id)) %in% .(control_ids_)")
p_controls <- p_controls %>% filter_(exclude_str_)
}
cat("Practice", practice, "exhausted.")
matched_
}, mc.cores = cores))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.