R/find_exam.R

Defines functions find_exam

Documented in find_exam

#' @title Find exam data within a given timeframe using parallel CPU computing and possibly shared RAM management.
#' @export
#'
#' @description Finds all, the earliest or the closest examinations relative to the given timepoints using parallel computing.
#'
#' @param d_from data table, the database which is searched to find examinations within the timeframe.
#' @param d_to data table, the database to which we wish to find examinations within the timeframe.
#' @param d_from_ID string, column name of the patient ID column in d_from. Defaults to \emph{ID_MERGE}.
#' @param d_to_ID string, column name of the patient ID column in d_to. Defaults to \emph{ID_MERGE}.
#' @param d_from_time string, column name of the time variable column in d_from. Defaults to \emph{time_rad_exam}.
#' @param d_to_time string, column name of the time variable column in d_to. Defaults to \emph{time_enc_admit}.
#' @param time_diff_name string, column name of the new column created which holds the time difference between the exam and the time provided by d_to. Defaults to \emph{timediff_exam_to_db}.
#' @param before boolean, should times before the given time be considered. Defaults to \emph{TRUE}.
#' @param after boolean, should times after the given time be considered. Defaults to \emph{TRUE}.
#' @param time integer, the timeframe considered between the exam and the d_to timepoints. Defaults to \emph{1}.
#' @param time_unit string, the unit of time used. Time variables in d_to and d_from are truncated to the supplied time unit.
#' For example: "2005-09-18 08:15:01 PDT" would be truncated to "2005-09-18 PDT" if \emph{time_unit} is set to days.
#' Then the time difference is calculated using \emph{difftime}, passing the argument to \emph{units}.
#' The following time units are supported: "secs", "mins", "hours", "days", "months" and "years". Defaults to \emph{days}.
#' @param multiple string, which exams to give back. \emph{closest} gives back the exam closest to the time provided by d_to.
#' \emph{all} gives back all occurrences within the timeframe. \emph{earliest} the earliest exam within the timeframe.
#' In case of ties for \emph{closest} or \emph{earliest}, all are returned. Defaults to \emph{closest}.
#' @param add_column string, a column name in d_to to add to the output. Defaults to \emph{NULL}.
#' @param keep_data boolean, whether to include empty rows with only the \emph{d_from_ID} column filled out for cases that have data in \emph{d_from}, but not within the time range. Defaults to \emph{FALSE}.
#' @param nThread integer, number of threads to use for parallelization. If it is set to 1, then no parallel backends are created and the function is executed sequentially. Defaults to \emph{parallel::detectCores()-1}.
#' @param shared_RAM boolean, whether to use shared memory during parallelization using the \emph{bigmemory} package. This allows to process \emph{d_from} and/or \emph{d_to} datasets with >1M rows.
#' Be aware that shared RAM usually results in slower run times, therefore by default it is set to \emph{FALSE},
#' but it allows to run more threads when the datasets are large providing overall faster run times.
#' Be aware that the optimal number of clusters might be different setting it to TRUE or FALSE, and this has to be determined empirically per machine.
#' The feature is very unstable and therefore should only be tried if there is no other option
#'
#' @return data table, with \emph{d_from} filtered to the rows that fall within the timeframe. The columns of \emph{d_from} are returned with the corresponding time column in \emph{d_to},
#' where the rows are instances which comply with the time constraints specified by the function. An additional column specified in \emph{time_diff_name} is also returned,
#' which shows the time difference between the time column in \emph{d_from} and \emph{d_to} for that given case.
#' Also the time column from \emph{d_to} specified by \emph{d_to_time} is returned under the name of \emph{time_to_db}.
#' An additional column specified in \emph{add_column} may be added from \emph{d_to} to the data table.
#'
#' @encoding UTF-8
#'
#' @examples \dontrun{
#' #Filter encounters for first emergency visits at one of MGH's ED departments
#' data_enc_ED <- data_enc[enc_clinic == "MGH EMERGENCY 10020010608"]
#' data_enc_ED <- data_enc_ED[!duplicated(data_enc_ED$ID_MERGE)]
#'
#' #Find all radiological examinations within 3 day of the ED registration
#' rdt_ED <- find_exam(d_from = data_rdt, d_to = data_enc_ED,
#' d_from_ID = "ID_MERGE", d_to_ID = "ID_MERGE",
#' d_from_time = "time_rdt_exam", d_to_time = "time_enc_admit", time_diff_name = "time_diff_ED_rdt",
#' before = TRUE, after = TRUE, time = 3, time_unit = "days", multiple = "all",
#' nThread = 2, shared_RAM = FALSE)
#'
#' #Find earliest radiological examinations within 3 day of the ED registration
#' rdt_ED <- find_exam(d_from = data_rdt, d_to = data_enc_ED,
#' d_from_ID = "ID_MERGE", d_to_ID = "ID_MERGE",
#' d_from_time = "time_rdt_exam", d_to_time = "time_enc_admit", time_diff_name = "time_diff_ED_rdt",
#' before = TRUE, after = TRUE, time = 3, time_unit = "days", multiple = "earliest",
#' nThread = 2, shared_RAM = FALSE)
#'
#' #Find closest radiological examinations on or after 1 day of the ED registration
#' #and add primary diagnosis column from encounters
#' rdt_ED <- find_exam(d_from = data_rdt, d_to = data_enc_ED,
#' d_from_ID = "ID_MERGE", d_to_ID = "ID_MERGE",
#' d_from_time = "time_rdt_exam", d_to_time = "time_enc_admit", time_diff_name = "time_diff_ED_rdt",
#' before = FALSE, after = TRUE, time = 1, time_unit = "days", multiple = "earliest",
#' add_column = "enc_diag_princ", nThread = 2, shared_RAM = FALSE)
#'
#' #Find closest radiological examinations on or after 1 day of the ED registration
#' #but also provide empty rows for patients with exam data but not within the timeframe
#' rdt_ED <- find_exam(d_from = data_rdt, d_to = data_enc_ED,
#' d_from_ID = "ID_MERGE", d_to_ID = "ID_MERGE",
#' d_from_time = "time_rdt_exam", d_to_time = "time_enc_admit", time_diff_name = "time_diff_ED_rdt",
#' before = FALSE, after = TRUE, time = 1, time_unit = "days", multiple = "earliest",
#' add_column = "enc_diag_princ", keep_data = TRUE, nThread = 2, shared_RAM = FALSE)
#' }


find_exam <- function(d_from, d_to,
                      d_from_ID = "ID_MERGE", d_to_ID = "ID_MERGE",
                      d_from_time = "time_rad_exam", d_to_time = "time_enc_admit",
                      time_diff_name = "timediff_exam_to_db", before = TRUE, after = TRUE, time = 1, time_unit = "days",
                      multiple = "closest", add_column = NULL, keep_data = FALSE, nThread = parallel::detectCores()-1, shared_RAM = FALSE) {

  # Thin dispatcher: every argument except `shared_RAM` is forwarded unchanged
  # to one of two backends. `find_exam_bm` is chosen only when more than one
  # thread is requested AND `shared_RAM` is TRUE; in every other case
  # (including nThread <= 1) `find_exam_ram` is used.
  #
  # `&&` (scalar, short-circuiting) is used rather than the elementwise `&`,
  # because an `if` condition must be a single TRUE/FALSE value.
  use_shared_ram <- nThread > 1 && shared_RAM
  backend <- if (use_shared_ram) find_exam_bm else find_exam_ram

  # Both backends take the same argument list, so the call is written once.
  out <- backend(d_from = d_from, d_to = d_to,
                 d_from_ID = d_from_ID, d_to_ID = d_to_ID,
                 d_from_time = d_from_time, d_to_time = d_to_time,
                 time_diff_name = time_diff_name, before = before, after = after, time = time, time_unit = time_unit,
                 multiple = multiple, add_column = add_column, keep_data = keep_data, nThread = nThread)

  # Return the backend's result as-is.
  out
}

Try the parseRPDR package in your browser

Any scripts or data that you put into this service are public.

parseRPDR documentation built on March 31, 2023, 11:36 p.m.