Nothing
#' Compute monthly co-occurrence matrix
#'
#' @param df_ehr Input data frame, monthly counts with columns Patient, Month,
#' Parent_Code, Count
#' @param uniq_codes Not required, useful for sql_cooc function
#' @param n_cores Number of cores
#' @param min_code_freq Filter matrix based on feature frequency
#' @param gc_before_parallel Call garbage collector before computation
#'
#' @return Co-occurrence sparse matrix
#'
#' @examples
#'
#' df_ehr = data.frame(Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4),
#' Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4),
#' Parent_Code = c('C1', 'C2', 'C2', 'C1', 'C1', 'C1', 'C2',
#' 'C3', 'C4'),
#' Count = 1:9)
#'
#' spm_cooc = build_df_cooc(df_ehr)
#'
#' @export
build_df_cooc = function(df_ehr, uniq_codes = NULL, n_cores = 1,
min_code_freq = 5, gc_before_parallel = TRUE) {
ord_idxs = df_ehr %$% order(Patient, Month, Parent_Code)
df_ehr = df_ehr[ord_idxs, ]
code_freqs = get_code_freqs(df_ehr)
if (is.null(uniq_codes)) uniq_codes = names(code_freqs)
df_ehr$Parent_Code = match(df_ehr$Parent_Code, uniq_codes)
df_ehr = split(df_ehr[-match('Patient', names(df_ehr))], df_ehr$Patient)
if (gc_before_parallel) gc()
df_ehr = parallel::mclapply(df_ehr, build_spm_cooc_patient, uniq_codes,
mc.cores = n_cores)
df_ehr = validate_errors(df_ehr)
df_ehr = do.call(rbind, df_ehr)
seq_codes = match(names(code_freqs), uniq_codes)
spm_freqs = Matrix::sparseMatrix(seq_codes, seq_codes, x = code_freqs,
triangular = TRUE, check = FALSE,
dims = rep(length(uniq_codes), 2))
if (!is.null(df_ehr)) {
df_ehr = Matrix::sparseMatrix(df_ehr[, 1], df_ehr[, 2], x = df_ehr[, 3],
triangular = TRUE, check = FALSE,
dims = rep(length(uniq_codes), 2),
dimnames = list(uniq_codes, uniq_codes))
}
if (min_code_freq > 0) {
rm_idxs = which(Matrix::diag(spm_freqs) < min_code_freq)
if (length(rm_idxs) > 0) {
spm_freqs = spm_freqs[-rm_idxs, -rm_idxs]
if (!is.null(df_ehr)) df_ehr = df_ehr[-rm_idxs, -rm_idxs]
}
}
if (is.null(df_ehr)) spm_freqs else df_ehr + spm_freqs
}
build_spm_cooc_patient = function(df_ehr, uniq_codes, n_cores) {
df_ehr = split(df_ehr[-match('Month', names(df_ehr))], df_ehr$Month)
df_ehr = df_ehr[sapply(df_ehr, nrow) > 1]
if (length(df_ehr) == 0) return(NULL)
.build_cooc_patient_month(df_ehr, uniq_codes)
}
.build_cooc_patient_month = function(ehr, uniq_codes) {
ehr = lapply(ehr, build_spm_cooc_month, uniq_codes)
ehr = do.call(rbind, ehr)
ehr = Matrix::sparseMatrix(ehr[, 1], ehr[, 2], x = ehr[, 3],
triangular = TRUE, check = FALSE,
dims = c(length(uniq_codes), length(uniq_codes)))
as.matrix(Matrix::summary(ehr))
}
build_spm_cooc_month = function(df_ehr_month, uniq_codes) {
m_cooc = RcppAlgos::comboGeneral(seq_along(df_ehr_month$Parent_Code), 2)
cbind(df_ehr_month$Parent_Code[m_cooc[, 1]],
df_ehr_month$Parent_Code[m_cooc[, 2]],
pmin.int(df_ehr_month$Count[m_cooc[, 1]],
df_ehr_month$Count[m_cooc[, 2]]))
}
validate_errors = function(df_ehr) {
errors = sapply(df_ehr, is, 'try-error')
if (any(errors)) {
warning(sum(errors), ' patients failed')
df_ehr = df_ehr[-which(errors)]
}
df_ehr
}
get_code_freqs = function(df_ehr) {
sapply(split(df_ehr$Count, df_ehr$Parent_Code), sum)
}
spm_to_df = function(spm) {
spm_names = rownames(spm)
spm = setNames(as.data.frame(Matrix::summary(spm)), c('V1', 'V2', 'value'))
data.frame(V1 = spm_names[spm$V1], V2 = spm_names[spm$V2],
value = spm$value)
}
# batch by 6k patients (~8GB)
batch_splits = function(l_df_ehr_month, n_batch) {
if (n_batch == 1) return(list(l_df_ehr_month))
seq_batch = rep(seq_len(n_batch), ceiling(length(l_df_ehr_month) / n_batch))
split(l_df_ehr_month, head(seq_batch, length(l_df_ehr_month)))
}
build_spm_cooc = function(df_cooc, uniq_codes) {
Matrix::sparseMatrix(match(df_cooc$V1, uniq_codes),
match(df_cooc$V2, uniq_codes),
x = df_cooc$value,
triangular = TRUE, check = FALSE,
dims = c(length(uniq_codes), length(uniq_codes)))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.