### findCondition.R ---
#----------------------------------------------------------------------
## Author: Thomas Alexander Gerds & Christian Torp-Pedersen
## Created: Mar 11 2019 (10:30)
## Version:
## Last-Updated: Jan 3 2020 (12:04)
## By: Thomas Alexander Gerds
## Update #: 37
#----------------------------------------------------------------------
##
### Commentary:
##
### Change Log:
# Core moved to c++
#----------------------------------------------------------------------
##
### Code:
#' @title Find conditions in registry data
#' @description
#' This functions is useful for the very common task of selecting cases based
#' on a code which has complete or partial match to a vector of character vari-
#' ables.
#'
#' The function is designed to search a group of variables (character) for
#' multiple conditions defined in a list of named character vectors. The func-
#' tion will produce a data.table with selected variables for cases where a
#' match is found. In addition a list of names character vectors can have
#' exclusions from the search. This last facility is useful if e.g. all
#' cancer except non melanoma skin cancer is sought. In that case inclusion
#' can have all cancers and the exclusions just the non-melanoma skin cancer.
#'
#' See examples for common use of the output
#' @usage
#' findCondition(data, vars, keep, conditions, exclusions=NULL,
#' match="contain",condition.name="X")
#'
#' @param data Data in which to search for conditions
#' @param vars Name(s) of variable(s) in which to search.
#' @param keep a character vector of the columns in Data.table to keep in
#' output
#' @param conditions A named list of (vectors of) search strings. See examples.
#' @param exclusions A names list of (vectors of) search strings to exclude
#' from the output.
#' @param match A variable to tell how to use the character vectors:
#' "exact"=exactly matches the search string, "contains"=contains the search
#' string, "start"=Starts with the search string, "end"=Ends with the search
#' string
#' @param condition.name Name of variable(s) where values define conditions.
#' The values of this variable are the names from parameter "conditions".
#' @return
#' A data table that includes the "keep-variables" and a variable named
#' \code{condition.name} which #' identifies the condition searched for
#' @examples
#' library(heaven)
#' library(data.table)
#'
#' # find all diagnoses that start with "DT"
#' set.seed(800); adm <- simAdmissionData(800)
#' x <- findCondition(adm,vars=c("diag"),
#' keep=c("pnr","inddto","uddto"),
#' conditions=list(THIS=c("DT")),
#' match="start",condition.name="THAT")
#' x
#' # restrict to first by pnr
#' x[x[,.I[1],by=list(pnr)]$V1]
#' # restrict to last by pnr
#' x[x[,.I[.N],by=list(pnr)]$V1]
#'
#' opr <- data.table(
#' pnr=1:100,opr=paste0(rep(c('A','B'),50),seq(0,100,10)),
#' oprtil=paste0(rep(c('A','C'),50),seq(0,100,10)),
#' odto=101:200
#' )
#' search <- list(Cond1=c('A1','A2'),Cond2=c('B10','B40','B5'),
#' Cond3=c('A1','C20','B2'))
#'
#' excl <- list(Cond2='B100')
#'
#' out <- findCondition(opr,vars=c("opr","oprtil"),
#' keep=c("pnr","odto"),
#' conditions=search, exclusions=excl,
#' match="start",condition.name="cond")
#' ### And how to use the result:
#' # Find first occurence of each condition and then use "dcast" to create
#' # a data.table with vectors corresponding to each condition.
#' test <- out[,list(min=min(odto)),by=c("pnr","cond")]
#' # provide a list of variables with one value each
#' test2 <- dcast(pnr~cond,data=test,value.var="min")
#' test2 # A datatable with first dates of each condition for each pnr, but only
#' # for pnr with at least one condition
#' # Define a condition as present when before a certain index date
#' dates <- data.table (pnr=1:100,basedate=sample(0:200,size=100,replace=TRUE))
#' test3 <- merge(out,dates,by="pnr")
#' test3[,before:=as.numeric(odto<=basedate)] # 1 when condition fulfille
#' test3 <- test3[,list(before=max(before)),by=c("pnr","cond")]
#' test4 <- dcast(pnr~cond,value.var="before",data=test3)
#' test4[is.na(test4)] <- 0 # Converts NAs to zero
#' test4[]
#'
#' @author Christian Torp-Pedersen <ctp@heart.dk>, Thomas A. Gerds <tag@biostat.ku.dk>
#' @export
findCondition <- function (data, vars, keep, conditions,exclusions=NULL, match = "contain",
condition.name = "X")
{
.SD=pnrnum=.N=searchcol=NULL
cond = NULL
if (match=="start") match.num <- 0L
else if (match=="exact") match.num <- 1L
else if (match=="end") match.num <- 2L
else if (match=="contain") match.num <- 3L
else stop("Choise of match not appropriate")
if (!is.character(vars) | !is.character(keep))
stop("Error - vars or keep not character")
if (!class(conditions) == "list" | is.null(names(conditions)))
stop("Error - Conditions not a named list")
if(!is.null(exclusions) & (!class(conditions) == "list" | is.null(names(conditions))))
stop("Error - Exclusions should be NULL or a named list")
requireNamespace("data.table")
setDT(data)
conditions <- copy(conditions)
exclusions <- copy(exclusions)
for (variable in unique(c(vars, keep))) {
if (!variable %in% names(data))
stop(paste0("Error - ", variable, " not in data to be analysed"))
}
#vector of conditions and exclusions - all given the maximal length of any criteria and padding with blanks
condnames <- names(conditions)
num.cond <- length(condnames)
max.cond <- max(sapply(conditions,length))
conditions <- lapply(conditions,function(x){c(x,rep("",max.cond-length(x)))}) # All max length, padded with ""
condnames <- lapply(condnames,function(x){rep(x,max.cond)}) #condnames aligned with conditions
conditions <- unlist(conditions)
condnames <- unlist(condnames)
if(!is.null(exclusions)){
exclnames <- names(exclusions)
num.excl <- length(exclnames)
max.excl <- max(sapply(exclusions,length))
exclusions <- lapply(exclusions,function(x){c(x,rep("",max.excl-length(x)))})
exclnames <- lapply(exclnames,function(x){rep(x,max.excl)}) #exclnames aligned with exclusions
exclusions <- unlist(exclusions)
exclnames <- unlist(exclnames)
}
else{
exclusions <- ""
exclnames <- ""
max.excl <- 0L
num.excl <- 0L
}
# Columnes to keep
keepCols <- data[,.SD,.SDcols=keep]
keepCols[,pnrnum:=1:.N]
# Columns to search
searchCols <- data[,.SD,.SDcols=c(vars)]
searchCols[,pnrnum:=1:.N]
# Make a single vector of searchCols
if(length(vars)==1) setnames(searchCols,vars,"searchcol")
else{
searchCols <- data.table::rbindlist(lapply(1:length(vars),function(x){
cols2 <- searchCols[,.SD,.SDcols=c("pnrnum",vars[x])]
setnames(cols2,vars[x],"searchcol")
cols2
}
))
}
#Remove unnecessary repetitions and blanks
searchCols <- searchCols[!(searchcol=="")]
setkeyv(searchCols,c("pnrnum","searchcol"))
searchCols <- unique(searchCols)
setcolorder(searchCols,c("pnrnum","searchcol"))
#cpp
out <- vectorSearch(searchCols[[1]], # Vector of row numbers for searchCols
searchCols[[2]], # Vector with search values
conditions, # Vector of inclusion conditions
exclusions, # Vector of exclusion conditions
condnames, # Names of conditions - same length as conditions
exclnames, # Names of exclusions - same length as exclusions
num.cond, # Number of inclusion criteria
max.cond, # Number of inclusions in each block
num.excl, # Number of of exclusion criteria
max.excl, # Number of exclusion criteria
length(searchCols[[1]]),# Length of searchCols vector
match.num) # 0=start 1=exact 2=end 3=contain
setDT(out)
setnames(out,"X",condition.name)
out <- merge(out,keepCols,by="pnrnum")
out[,pnrnum:=NULL]
out[]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.