R/cooc.R

Defines functions build_spm_cooc batch_splits spm_to_df get_code_freqs validate_errors build_spm_cooc_month .build_cooc_patient_month build_spm_cooc_patient build_df_cooc

Documented in build_df_cooc spm_to_df

#' Compute monthly co-occurrence matrix
#'
#' @param df_ehr Input data frame, monthly counts with columns Patient, Month,
#'               Parent_Code, Count
#' @param uniq_codes Not required, useful for sql_cooc function
#' @param n_cores Number of cores
#' @param min_code_freq Filter matrix based on feature frequency
#' @param gc_before_parallel Call garbage collector before computation
#'
#' @return Co-occurrence sparse matrix
#'
#' @examples
#'
#' df_ehr = data.frame(Month = c(1, 1, 1, 2, 2, 3, 3, 4, 4),
#'                     Patient = c(1, 1, 2, 1, 2, 1, 1, 3, 4),
#'                     Parent_Code = c('C1', 'C2', 'C2', 'C1', 'C1', 'C1', 'C2',
#'                                     'C3', 'C4'),
#'                     Count = 1:9)
#'
#' spm_cooc = build_df_cooc(df_ehr)
#'
#' @export
build_df_cooc = function(df_ehr, uniq_codes = NULL, n_cores = 1,
                         min_code_freq = 5, gc_before_parallel = TRUE) {

  ord_idxs = df_ehr %$% order(Patient, Month, Parent_Code) 
  df_ehr = df_ehr[ord_idxs, ]

  code_freqs = get_code_freqs(df_ehr)

  if (is.null(uniq_codes)) uniq_codes = names(code_freqs)
  df_ehr$Parent_Code = match(df_ehr$Parent_Code, uniq_codes)

  df_ehr = split(df_ehr[-match('Patient', names(df_ehr))], df_ehr$Patient)

  if (gc_before_parallel) gc()

  df_ehr = parallel::mclapply(df_ehr, build_spm_cooc_patient, uniq_codes,
                              mc.cores = n_cores)
   
  df_ehr = validate_errors(df_ehr)
  df_ehr = do.call(rbind, df_ehr)

  seq_codes = match(names(code_freqs), uniq_codes)

  spm_freqs = Matrix::sparseMatrix(seq_codes, seq_codes, x = code_freqs,
		           			       triangular = TRUE, check = FALSE,
		           			       dims = rep(length(uniq_codes), 2))

  if (!is.null(df_ehr)) {
    df_ehr = Matrix::sparseMatrix(df_ehr[, 1], df_ehr[, 2], x = df_ehr[, 3],
                                  triangular = TRUE, check = FALSE,
                                  dims = rep(length(uniq_codes), 2),
                                  dimnames = list(uniq_codes, uniq_codes))
  }

  if (min_code_freq > 0) {

    rm_idxs = which(Matrix::diag(spm_freqs) < min_code_freq)

    if (length(rm_idxs) > 0) {
      spm_freqs = spm_freqs[-rm_idxs, -rm_idxs]
      if (!is.null(df_ehr)) df_ehr = df_ehr[-rm_idxs, -rm_idxs]
    }
  }

  if (is.null(df_ehr)) spm_freqs else df_ehr + spm_freqs
}

build_spm_cooc_patient = function(df_ehr, uniq_codes, n_cores) {

  df_ehr = split(df_ehr[-match('Month', names(df_ehr))], df_ehr$Month)
  df_ehr = df_ehr[sapply(df_ehr, nrow) > 1]
  if (length(df_ehr) == 0) return(NULL)

  .build_cooc_patient_month(df_ehr, uniq_codes)
}

.build_cooc_patient_month = function(ehr, uniq_codes) {

  ehr = lapply(ehr, build_spm_cooc_month, uniq_codes)
 
  ehr = do.call(rbind, ehr)
 
  ehr = Matrix::sparseMatrix(ehr[, 1], ehr[, 2], x = ehr[, 3],
                             triangular = TRUE, check = FALSE,
                             dims = c(length(uniq_codes), length(uniq_codes)))

  as.matrix(Matrix::summary(ehr))
}

build_spm_cooc_month = function(df_ehr_month, uniq_codes) {

  m_cooc = RcppAlgos::comboGeneral(seq_along(df_ehr_month$Parent_Code), 2)

  cbind(df_ehr_month$Parent_Code[m_cooc[, 1]],
        df_ehr_month$Parent_Code[m_cooc[, 2]],
        pmin.int(df_ehr_month$Count[m_cooc[, 1]],
                 df_ehr_month$Count[m_cooc[, 2]]))
}

validate_errors = function(df_ehr) {

  errors = sapply(df_ehr, is, 'try-error')
  if (any(errors)) {
    warning(sum(errors), ' patients failed')
    df_ehr = df_ehr[-which(errors)]
  }

  df_ehr
}

get_code_freqs = function(df_ehr) {
  sapply(split(df_ehr$Count, df_ehr$Parent_Code), sum)
}

spm_to_df = function(spm) {

  spm_names = rownames(spm)
  spm = setNames(as.data.frame(Matrix::summary(spm)), c('V1', 'V2', 'value'))
  
  data.frame(V1 = spm_names[spm$V1], V2 = spm_names[spm$V2],
             value = spm$value)
}


# batch by 6k patients (~8GB)
batch_splits = function(l_df_ehr_month, n_batch) {

  if (n_batch == 1) return(list(l_df_ehr_month))

  seq_batch = rep(seq_len(n_batch), ceiling(length(l_df_ehr_month) / n_batch))

  split(l_df_ehr_month, head(seq_batch, length(l_df_ehr_month)))
}



build_spm_cooc = function(df_cooc, uniq_codes) {

  Matrix::sparseMatrix(match(df_cooc$V1, uniq_codes),
                       match(df_cooc$V2, uniq_codes),
                       x = df_cooc$value,
                       triangular = TRUE, check = FALSE,
                       dims = c(length(uniq_codes), length(uniq_codes)))
}

Try the nlpembeds package in your browser

Any scripts or data that you put into this service are public.

nlpembeds documentation built on April 4, 2025, 4:41 a.m.