# R/MAP.R

#' MAP algorithm
#'
#' Main function that runs the MAP algorithm to calculate the predicted
#' probability of a positive phenotype for each patient, based on NLP and
#' ICD counts adjusted for healthcare utilization.
#' For large numbers of patients (e.g., more than 50,000) the fit can be
#' slow, so the subset_sample argument allows the model to be fit on a
#' subset of patients, with the remaining patients projected onto that fit
#' (see the second example below). subset_sample_size controls the maximum
#' number of patients used for the fit.
#'
#' @param mat Count data (sparse matrix). One column must contain the ICD
#'            counts and have "ICD" in its name.
#' @param note Note count (sparse matrix) indicating healthcare utilization.
#' @param yes.con A logical variable indicating whether a concomitant
#'                variable model is desired. Not used for now.
#' @param full.output A logical variable indicating whether the per-variable
#'                    probabilities should also be returned.
#' @param subset_sample Logical; if TRUE, perform the fit on a subset of
#'                      patients and project the remaining patients.
#' @param subset_sample_size Maximum number of patients on which to perform
#'                           the fit when subset_sample is TRUE (default 5000).
#' @param verbose Logical; if TRUE, print a summary of the input data.
#'
#' @return Returns a list with the following objects:
#'         \item{scores}{The predicted probability of the phenotype for each
#'         patient.}
#'         \item{cut.MAP}{The cutoff value that can be used to derive binary
#'         phenotypes.}
#'         \item{scores.all}{The per-variable predicted probabilities;
#'         returned only when full.output is TRUE.}
#'
#' @references High-throughput Multimodal Automated Phenotyping (MAP) with
#' Application to PheWAS. Katherine P. Liao, Jiehuan Sun,
#' Tianrun A. Cai, Nicholas Link, Chuan Hong, Jie Huang, Jennifer Huffman,
#' Jessica Gronsbell, Yichi Zhang, Yuk-Lam Ho, Victor Castro, Vivian Gainer,
#' Shawn Murphy, Christopher J. O’Donnell, J. Michael Gaziano, Kelly Cho,
#' Peter Szolovits, Isaac Kohane, Sheng Yu, and Tianxi Cai
#' with the VA Million Veteran Program (2019) <doi:10.1101/587436>.
#'
#' @examples
#' ## simulate data to test the algorithm
#' library(Matrix)
#' n = 400
#' ICD = c(rpois(n / 4, 10), rpois(n / 4, 1), rep(0, n / 2))
#' NLP = c(rpois(n / 4, 10), rpois(n / 4, 1), rep(0, n / 2))
#' mat = Matrix(data = cbind(ICD, NLP), sparse = TRUE)
#' note = Matrix(rpois(n, 10) + 5, ncol = 1, sparse = TRUE)
#' res = MAP(mat = mat, note = note)
#' head(res$scores)
#' res$cut.MAP
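#'
#' ## Illustrative only: fit on a subset of patients and project the rest;
#' ## subset_sample_size is kept small here so the subset path is exercised
#' ## on this small simulated dataset.
#' res_sub = MAP(mat = mat, note = note, subset_sample = TRUE,
#'               subset_sample_size = 100)
#' head(res_sub$scores)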
#'
#' @export
MAP = function(mat = NULL, note = NULL, yes.con = FALSE, full.output = FALSE,
               subset_sample = FALSE, subset_sample_size = 5000,
               verbose = TRUE) {

  vname = colnames(mat)
  if (length(grep("ICD", vname, fixed = TRUE)) == 0) {
    stop("Please provide ICD data in mat with colname being ICD")
  }
  ICD = mat[, grep("ICD", vname, fixed = TRUE)]

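  ## Summarize data availability per patient, then augment each count
  ## variable with its log(1 + count) version so that a Poisson mixture
  ## (raw counts) and a Gaussian mixture (log counts) are both fit below.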
  IDtab = IDtab(mat, note)
  vname.log = paste(vname, "_log", sep = "")
  name.all = c(vname, vname.log)

  mat.log = mat
  mat.log@x = log(1 + mat.log@x)
  mat = cbind(mat, mat.log)
  rm(mat.log)

  colnames(mat) = name.all
  note@x = log(note@x + 1)

  post.all = Matrix(0, nrow = nrow(mat), ncol = ncol(mat))
  prob.all = Matrix(0, nrow = nrow(mat), ncol = ncol(mat))
  family = c(rep("poisson", length(vname)), rep("gaussian", length(vname.log)))

  for (i in seq_along(name.all)) {

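    ## For each raw/log count variable, fit a two-component mixture model
    ## (via flexmix), carrying the log note count along to adjust for
    ## healthcare utilization.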
    tmpfm = as.formula(paste(name.all[i], "~1"))

    ## Both branches currently set a constant concomitant model; yes.con is a
    ## placeholder and has no effect for now.
    if (yes.con) {

      tmpfm2 = FLXPconstant()

    } else {

      tmpfm2 = FLXPconstant()
    }

    ## Restrict the fit to patients with a nonzero, non-missing count and a
    ## nonzero, non-missing note count.
    zero.ind = is.element(mat[, i], 0)
    na.ind = (is.element(mat[, i], NA) | is.element(note[, 1], c(0, NA)))

    exclude.ind = (zero.ind | na.ind)
    dat.tmp = data.frame(mat[!exclude.ind, i], note[!exclude.ind, 1])
    colnames(dat.tmp) = c(name.all[i], "note")

    totalN00 = nrow(dat.tmp)

    if (subset_sample && nrow(dat.tmp) > subset_sample_size) {

      post_obj = fitproj_flexmix(tmpfm, note, family[i], tmpfm2, dat.tmp,
                                subset_sample_size)

    } else {

      fit_obj = fit_flexmix(tmpfm, note, family[i], tmpfm2, dat.tmp)
      post_obj = fit_to_posterior_obj(dat.tmp, fit_obj)
    }

    n.clust = length(unique(post_obj$cluster))

    if (n.clust > 1) {

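      ## Take the cluster with the larger mean count as the "positive"
      ## component and store its posterior probability on the probit
      ## (qnorm) scale so the per-variable scores can be combined later.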
      avgcount = post_obj$m_data[,name.all[i]]

      tmpdiff = mean(avgcount[post_obj$cluster == 2]) -
        mean(avgcount[post_obj$cluster == 1])

      tmpind = as.numeric((tmpdiff > 0) + 1)
      qprobtmp = qnorm(post_obj$m_post)

      qprob = qprobtmp[, tmpind]
      qprob[is.infinite(qprob)] = - qprobtmp[is.infinite(qprob), 3 - tmpind]

      ### Deal with extreme clustering results: posteriors of exactly 0 or 1
      ### give qnorm values of -Inf/Inf, so cap infinite scores at the finite
      ### range, or at qnorm(1/N) and qnorm(1 - 1/N) when no finite values exist
      if (sum(!is.infinite(qprob)) >= 2) {

        qprob[qprob == Inf] = max(qprob[!is.infinite(qprob)])
        qprob[qprob == -Inf] = min(qprob[!is.infinite(qprob)])

      } else if (sum(!is.infinite(qprob)) == 1) {

        if (qprob[!is.infinite(qprob)] >= 0) {

          qprob[qprob == Inf] = qprob[!is.infinite(qprob)]
          qprob[qprob == -Inf] = qnorm(1 / totalN00)

        } else {

          qprob[qprob == Inf] = qnorm(1 - 1 / totalN00)
          qprob[qprob == -Inf] = qprob[!is.infinite(qprob)]

        }
      } else {

        qprob[qprob == Inf] = qnorm(1 - 1 / totalN00)
        qprob[qprob == -Inf] = qnorm(1 / totalN00)
      }

    } else {

      qprob = rep(qnorm(1 - 1 / totalN00), nrow(post_obj$m_data))
      warning(paste("The clustering step does not converge for variable",
                    name.all[i]))
    }

    post.all[!exclude.ind, i] = qprob
    if (sum(na.ind) > 0) post.all[na.ind, i] = NA

    prob.all[!exclude.ind, i] = pnorm(qprob)
    if (sum(na.ind) > 0) prob.all[na.ind, i] = NA
  }

  colnames(post.all) = name.all
  colnames(prob.all) = name.all

  ## select eligible patients to estimate the prevalence
  ## and re-scale predicted prob.
  prob.all00 = prob.all
  na.id = (rowSums(is.na(prob.all00)) == ncol(prob.all00))
  prob.all00[is.na(prob.all00)] = 0
  mu00 = rowMeans(prob.all00)
  exclude.ind = (mu00 == 0)
  post.all00 = as.matrix(post.all[!exclude.ind, ])
  mat00 = mat[!exclude.ind, ]
  rm(post.all)

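  ## Combine variables: the final score is the average of (i) the mean of
  ## the per-variable posterior probabilities and (ii) the probability
  ## implied by the mean probit score across variables.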
  final.score00 = (rowMeans(pnorm(post.all00), na.rm=TRUE) +
                     pnorm(rowMeans(post.all00, na.rm=TRUE))) / 2
  final.score00[is.na(final.score00)] = NA

  avgcount = rowMeans(mat00[, vname.log, drop = FALSE], na.rm = TRUE)
  avgpost = rowMeans(post.all00[, vname.log, drop=FALSE], na.rm = TRUE)
  avgcount = avgcount[!is.na(avgpost)]
  avgpost = avgpost[!is.na(avgpost)]

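  ## Estimate the phenotype prevalence: 2-means clustering on the average
  ## log count and average probit score, taking the higher-count cluster as
  ## positive, then average that prevalence with the mean final score.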
  cluster.ori = kmeans(cbind(avgcount, avgpost), centers=2)
  class.pos = 1 + (cor(cluster.ori$cluster == 2, avgcount) > 0)
  prev.est0 = mean(cluster.ori$cluster == class.pos)
  prev.est = (mean(final.score00, na.rm = TRUE) + prev.est0) / 2

  final.score00 = Prob.Scale(final.score00, prev.est)
  cut.MAP = as.numeric(quantile(final.score00, 1 - prev.est, na.rm = TRUE))
  final.score = Matrix(0, nrow = nrow(mat), ncol=1)
  final.score[!exclude.ind, 1] = final.score00
  if (sum(na.id) > 0) final.score[na.id, 1] = NA

  IDtab = IDtab[IDtab$Freq > 0, ]

  if (verbose) {
    cat("####################### \n")
    cat("MAP only considers patients who have note count data and
        at least one non-missing variable\n")
    cat("####\nHere is a summary of the input data:\n")
    cat("Total number of patients:", sum(IDtab$Freq), "\n")
    print(IDtab)
    cat("#### \n")
  }

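  ## Patients with zero ICD counts are assigned probability 0.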
  final.score[ICD == 0] = 0

  if (full.output) {
    list("scores" = final.score, "cut.MAP" = cut.MAP, "scores.all" = prob.all)
  } else {
    list("scores" = final.score, "cut.MAP" = cut.MAP)
  }
}
#' Internal function to do inverse logit transformation
#' @noRd
g.logit = function(xx = NULL) exp(xx) / (exp(xx) + 1)

#' Internal function to do logit transformation
#' @noRd
logit = function(xx = NULL) log(xx / (1 - xx))

#' Internal function to get the probabilities that match the prevalence
#' @noRd
Prob.Scale = function(pp = NULL, prev = NULL) {

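  ## Shift all scores by a constant on the logit scale so that the mean of
  ## the rescaled probabilities equals the target prevalence.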
  logit_pp = logit(pp)
  logit_pp[logit_pp == -Inf] = min(logit_pp[logit_pp > -Inf], na.rm = TRUE)
  logit_pp[logit_pp ==  Inf] = max(logit_pp[logit_pp < Inf], na.rm = TRUE)

  cc = uniroot(function(xx) mean(g.logit(logit_pp - xx), na.rm = TRUE) - prev,
               interval = c(-10, 10))$root

  g.logit(logit_pp - cc)
}

#' Internal function to get the summary table
#' @noRd
IDtab = function(mat = NULL, note = NULL) {

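  ## Tabulate, for each combination of YES/NO availability across the count
  ## variables and the note counts, how many patients fall in that cell.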
  mat = cbind(mat, note)
  colnames(mat)[ncol(mat)] = "note"

  df = matrix(c("NO", "YES")[as.matrix(!is.na(mat)) + 1], ncol = ncol(mat))
  df = as.data.frame(df)
  colnames(df) = colnames(mat)

  as.data.frame(table(df))
}



#' @title MAP dictionary
#' @description MAP dictionary that maps phecodes to CUIs
#' @format A list of length 1866; each element contains the CUIs associated
#'   with one phecode
#' @examples
#' head(phecode.cuis.list)
#' tail(phecode.cuis.list)
"phecode.cuis.list"
