#' Multiple states
#'
#' @description Sometimes, diseases may have multiple states, for example a different stage of cancer or kidney disease. This function processes \code{comorbid_columns} that have multiple states
#' and processes them to a new \code{master_str} which is made up of 0s and 1s only.
#' The intended use is to run this on data that has multiple states, merge the \code{master_str} to the original data on the basis of the \code{comorbid_column}.
#' The \code{\link{make_stem}} can then be run on this data.
#'
#' @param comorbid_column A vector of character strings made up of numbers 0-9 or of factors coercible to character, all should be identical lengths.
#' @param dis_names A list of disease names associated with each position as used in \code{\link{get_disease_counts}}
#'
#' @returns If \code{dis_names} is passed, then a list, the first element of which is a data.frame, the second is a vector of new names.
#' If only \code{comorbid_column} is passed, then only a data.frame is returned which includes:
#' \describe{
#' \item{comorbid_column}{The original \code{comorbid_column} passed to the function}
#' \item{Freq}{The frequency of that \code{comorbid_column}}
#' \item{\code{master_str}}{The new string that has been returned}
#'}
#' @examples
#' comorbid_column <- c('23001', '11101', '21011', '22111', '23001')
#' disease_names = c('first', 'second', 'third', 'fourth', 'fifth')
#'
#' ms <- multiple_state_processor(comorbid_column) #no names
#' ms <- multiple_state_processor(comorbid_column, dis_names = disease_names)
#'
#' @export
multiple_state_processor <- function(comorbid_column, dis_names = NULL){
if(!check_strings_equal(comorbid_column)){stop('comorbid column must be the same length for each record')}
if(any(grepl('[^0-9]',comorbid_column) == TRUE))stop('There are non-numeric characters in the comorbid strings, please remove')
comorbid_column <- to_string(comorbid_column)
# number of items in string
n <- nchar(comorbid_column[1])
# maximal state per item
max_by_po <- sapply(1:n, function(x) max(as.numeric(substr(comorbid_column, x, x))))
# remake comorbid_column based on max, value becomes relevant pos that ==1
# e.g. for a position with max 3, 000 = no disease, 100 = 1, 010 = 2, 001 = 3
# define a new position for each string
tt <- data.frame(table(comorbid_column))
# now we need to assign elements
tt$master_str <- character(nrow(tt))
# iterate over each position of the string
for(pos in 1:n){
# identify the maximal number for each position
max_n <- max_by_po[pos]
# identify the relevant element from the string
element <- as.numeric(substr(tt$comorbid_column, pos, pos))
#find which rows have which elements
if(max_n == 1){
# if 1/0, then rows == those where 1 is present
rows <- which(element=='1')
}else{
#otherwise, make a vector of rows which have which values
str_positions <- sapply(1:max_n, function(x) which(element == x))
}# now replace those elements
# create empty string to work with
tt$str <- paste0(rep('0', max_n), collapse='')
if(max_n == 0){
# if there are NO values (i.e. 0 in all strings), then simply paste empty column to str
cat("empty column - I'll fill it with 0")
tt$str <- paste0(rep('0', 1), collapse='')
}
else{
# if max_n >= 0
# Iterate over each element up to the maximum for that setting
for(position in 1:max_n){
# if only one element, we made rows above, so skip this rows creation
if(exists('rows')==FALSE){
rows <- str_positions[[position]]
}
# If there aren't any rows with this position count, next loop after removing rows
if(length(rows) == 0){rm(rows);next}
#select rows where the criteria is met
int_str <- tt$str[rows]
# paste a '1' in the relevant position within the string
substr(int_str, position, position) <- '1'
# put back into tt
tt[rows, 'str'] <- int_str
# remove rows so we can test with exists above.
rm(rows)
}
}
# paste togther str and master_str to make new master_str
tt[,'master_str'] <- apply(tt, 1, function(x) paste0(x['master_str'], x['str']))
# clean up - remove str for this loop
tt$str <- NULL
}
# dis_name handling
if(!is.null(dis_names)){
if(length(dis_names) != n)stop('Disease names should be the same length as comorbid string')
# replicate each dis_name by the maximal position number
d_name <- rep(dis_names, max_by_po)
# create numbers
numbering <- lapply(max_by_po, function(x) 1:x)
# make to vector
numbers_ready <- Reduce(c, numbering)
# paste together elemtwise
new_names <- do.call(paste0, list(d_name, numbers_ready))
# return tt and names
return(list(tt, new_names))
}
cat('If you provide column names I will reassign them for you \n')
# only return tt
return(tt)
}
.identify_multiple_states <-function(comorbid_columns){
# staged identification
# find any strings with non-numeric, these should stop
if(any(grepl('[^0-9]',comorbid_columns) == TRUE))stop('There are non-numeric characters in the comorbid strings, please remove')
##find those that aren't 0-1
if(any(grepl('[^0-1]', comorbid_columns) == TRUE))
# If we find non 0-1 --> multiple states
{return('multiple')}
else
# Otherwise it is single state
{return('single')}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.