# Nothing  -- stray scraper artifact; commented out so the file parses as R
#' MAP algorithm
#'
#' Main function to perform MAP algorithm to calculate predicted
#' probabilities of positive phenotype for each patient
#' based on NLP and ICD counts adjusted for healthcare utilization.
#' For large number of patients (>50k) it may take very long to compute,
#' so a subset_sample parameter is provided to perform the fit on a subset of
#' patients and project the remaining. The subset_sample_size controls the
#' maximum number of patients on which to perform the fit.
#'
#' @param mat Count data (sparse matrix). One of the columns has to be ICD
#' data with name being ICD.
#' @param note Note count (sparse matrix) indicating healthcare utilization.
#' @param yes.con A logical variable indicating if concomitant is desired. Not
#' used for now.
#' @param full.output A logical variable indicating if full outputs are
#' desired.
#' @param subset_sample Logical, perform fit on a subset of patients and
#' project remaining.
#' @param subset_sample_size If subset_sample TRUE, number of patients on which
#' to perform the fit (default 5000).
#' @param verbose Print model information
#'
#' @return Returns a list with following objects:
#' \item{scores}{Indicates predicted probabilities.}
#' \item{cut.MAP}{The cutoff value that can be used to derive binary
#' phenotypes.}
#'
#' @references High-throughput Multimodal Automated Phenotyping (MAP) with
#' Application to PheWAS. Katherine P. Liao, Jiehuan Sun,
#' Tianrun A. Cai, Nicholas Link, Chuan Hong, Jie Huang, Jennifer Huffman,
#' Jessica Gronsbell, Yichi Zhang, Yuk-Lam Ho, Victor Castro, Vivian Gainer,
#' Shawn Murphy, Christopher J. O’Donnell, J. Michael Gaziano, Kelly Cho,
#' Peter Szolovits, Isaac Kohane, Sheng Yu, and Tianxi Cai
#' with the VA Million Veteran Program (2019) <doi:10.1101/587436>.
#'
#' @examples
#' ## simulate data to test the algorithm
#' n = 400
#' ICD = c(rpois(n/4,10), rpois(n/4,1), rep(0,n/2) )
#' NLP = c(rpois(n/4,10), rpois(n/4,1), rep(0,n/2) )
#' mat = Matrix(data=cbind(ICD,NLP),sparse = TRUE)
#' note = Matrix(rpois(n,10)+5,ncol=1,sparse = TRUE)
#' res = MAP(mat = mat, note=note)
#' head(res$scores)
#' res$cut.MAP
#'
#' @export
MAP = function(mat = NULL, note = NULL, yes.con = FALSE, full.output = FALSE,
               subset_sample = FALSE, subset_sample_size = 5000,
               verbose = TRUE) {
  vname = colnames(mat)
  if (length(grep("ICD", vname, fixed = TRUE)) == 0) {
    stop("Please provide ICD data in mat with colname being ICD")
  }
  ICD = mat[, grep("ICD", vname, fixed = TRUE)]
  IDtab = IDtab(mat, note)
  ## augment the raw counts with log(1 + count) versions; the Poisson family
  ## is fit on the raw columns, the Gaussian family on the log columns
  vname.log = paste(vname, "_log", sep = "")
  name.all = c(vname, vname.log)
  mat.log = mat
  mat.log@x = log(1 + mat.log@x)
  mat = cbind(mat, mat.log)
  rm(mat.log)
  colnames(mat) = name.all
  note@x = log(note@x + 1)
  post.all = Matrix(0, nrow = nrow(mat), ncol = ncol(mat))
  prob.all = Matrix(0, nrow = nrow(mat), ncol = ncol(mat))
  family = c(rep("poisson", length(vname)), rep("gaussian", length(vname.log)))
  for (i in seq_along(name.all)) {
    tmpfm = as.formula(paste(name.all[i], "~1"))
    ## concomitant model: yes.con is currently not used, so both settings
    ## fall back to a constant concomitant model
    tmpfm2 = FLXPconstant()
    ## BUG FIX: this data-preparation step previously lived only inside the
    ## !yes.con branch, so calling MAP(..., yes.con = TRUE) failed with
    ## "object 'dat.tmp' not found". It must run unconditionally.
    ## Exclude zero counts and rows where either the count or the note
    ## count is missing/zero.
    zero.ind = is.element(mat[, i], 0)
    na.ind = (is.element(mat[, i], NA) | is.element(note[, 1], c(0, NA)))
    exclude.ind = (zero.ind | na.ind)
    dat.tmp = data.frame(mat[!exclude.ind, i], note[!exclude.ind, 1])
    colnames(dat.tmp) = c(name.all[i], "note")
    totalN00 = nrow(dat.tmp)
    if (subset_sample && nrow(dat.tmp) > subset_sample_size) {
      ## fit on a subsample and project the remaining patients
      post_obj = fitproj_flexmix(tmpfm, note, family[i], tmpfm2, dat.tmp,
                                 subset_sample_size)
    } else {
      fit_obj = fit_flexmix(tmpfm, note, family[i], tmpfm2, dat.tmp)
      post_obj = fit_to_posterior_obj(dat.tmp, fit_obj)
    }
    n.clust = length(unique(post_obj$cluster))
    if (n.clust > 1) {
      ## the "positive" component is the one with the larger mean count;
      ## tmpind picks its posterior column
      avgcount = post_obj$m_data[, name.all[i]]
      tmpdiff = mean(avgcount[post_obj$cluster == 2]) -
        mean(avgcount[post_obj$cluster == 1])
      tmpind = as.numeric((tmpdiff > 0) + 1)
      qprobtmp = qnorm(post_obj$m_post)
      qprob = qprobtmp[, tmpind]
      ## posterior of exactly 0/1 maps to -Inf/Inf on the probit scale;
      ## recover from the complementary column where possible
      qprob[is.infinite(qprob)] = -qprobtmp[is.infinite(qprob), 3 - tmpind]
      ### deal with some extreme clustering results ###
      if (sum(!is.infinite(qprob)) >= 2) {
        qprob[qprob == Inf] = max(qprob[!is.infinite(qprob)])
        qprob[qprob == -Inf] = min(qprob[!is.infinite(qprob)])
      } else if (sum(!is.infinite(qprob)) == 1) {
        if (qprob[!is.infinite(qprob)] >= 0) {
          qprob[qprob == Inf] = qprob[!is.infinite(qprob)]
          qprob[qprob == -Inf] = qnorm(1 / totalN00)
        } else {
          qprob[qprob == Inf] = qnorm(1 - 1 / totalN00)
          qprob[qprob == -Inf] = qprob[!is.infinite(qprob)]
        }
      } else {
        qprob[qprob == Inf] = qnorm(1 - 1 / totalN00)
        qprob[qprob == -Inf] = qnorm(1 / totalN00)
      }
    } else {
      ## degenerate one-cluster solution: fall back to a near-1 probit score
      qprob = rep(qnorm(1 - 1 / totalN00), nrow(post_obj$m_data))
      warning(paste("The clustering step does not converge for variable",
                    name.all[i]))
    }
    post.all[!exclude.ind, i] = qprob
    if (sum(na.ind) > 0) post.all[na.ind, i] = NA
    prob.all[!exclude.ind, i] = pnorm(qprob)
    if (sum(na.ind) > 0) prob.all[na.ind, i] = NA
  }
  colnames(post.all) = name.all
  colnames(prob.all) = name.all
  ## select eligible patients to estimate the prevalence
  ## and re-scale predicted prob.
  prob.all00 = prob.all
  na.id = (rowSums(is.na(prob.all00)) == ncol(prob.all00))
  prob.all00[is.na(prob.all00)] = 0
  mu00 = rowMeans(prob.all00)
  exclude.ind = (mu00 == 0)
  post.all00 = as.matrix(post.all[!exclude.ind, ])
  mat00 = mat[!exclude.ind, ]
  rm(post.all)
  ## ensemble score: average of the mean on the probability scale and the
  ## probit-transformed mean on the probit scale
  final.score00 = (rowMeans(pnorm(post.all00), na.rm = TRUE) +
                     pnorm(rowMeans(post.all00, na.rm = TRUE))) / 2
  ## (a former line re-assigning NA to NA entries was a no-op and was removed)
  avgcount = rowMeans(mat00[, vname.log, drop = FALSE], na.rm = TRUE)
  avgpost = rowMeans(post.all00[, vname.log, drop = FALSE], na.rm = TRUE)
  avgcount = avgcount[!is.na(avgpost)]
  avgpost = avgpost[!is.na(avgpost)]
  ## crude prevalence estimate: 2-means on (avg log count, avg probit score);
  ## the positive cluster is the one positively correlated with counts
  cluster.ori = kmeans(cbind(avgcount, avgpost), centers = 2)
  class.pos = 1 + (cor(cluster.ori$cluster == 2, avgcount) > 0)
  prev.est0 = mean(cluster.ori$cluster == class.pos)
  prev.est = (mean(final.score00, na.rm = TRUE) + prev.est0) / 2
  final.score00 = Prob.Scale(final.score00, prev.est)
  cut.MAP = as.numeric(quantile(final.score00, 1 - prev.est, na.rm = TRUE))
  final.score = Matrix(0, nrow = nrow(mat), ncol = 1)
  final.score[!exclude.ind, 1] = final.score00
  if (sum(na.id) > 0) final.score[na.id, 1] = NA
  IDtab = IDtab[IDtab$Freq > 0, ]
  if (verbose) {
    cat("####################### \n")
    cat("MAP only considers patients who have note count data and
at least one non-missing variable\n")
    cat("####\nHere is a summary of the input data:\n")
    cat("Total number of patients:", sum(IDtab$Freq), "\n")
    print(IDtab)
    cat("#### \n")
  }
  ## patients with zero ICD counts are forced to probability 0
  final.score[ICD == 0] = 0
  if (full.output) {
    list("scores" = final.score, "cut.MAP" = cut.MAP, "scores.all" = prob.all)
  } else {
    list("scores" = final.score, "cut.MAP" = cut.MAP)
  }
}
#' Internal function to do inverse logit transformation
#'
#' Numerically stable form: the previous exp(xx) / (exp(xx) + 1) evaluates
#' to Inf / Inf = NaN for large xx (roughly xx > 709), whereas
#' 1 / (1 + exp(-xx)) correctly returns 1 there and 0 for very negative xx.
#' @noRd
g.logit = function(xx = NULL) 1 / (1 + exp(-xx))
#' Internal function to do logit transformation
#'
#' Maps a probability to the log-odds scale (inverse of g.logit).
#' @noRd
logit = function(xx = NULL) {
  odds = xx / (1 - xx)
  log(odds)
}
#' Internal function to get the probabilities that match the prevalence
#'
#' Shifts the scores on the logit scale by a constant so that the mean
#' predicted probability equals the target prevalence, then maps back to
#' the probability scale.
#'
#' @param pp Numeric vector of probabilities in [0, 1]; may contain NA.
#' @param prev Target prevalence, a scalar assumed to lie in (0, 1).
#' @return Numeric vector of re-scaled probabilities, same length as pp.
#' @noRd
Prob.Scale = function(pp = NULL, prev = NULL) {
  logit_pp = logit(pp)
  ## pp values of exactly 0 or 1 give +/-Inf on the logit scale;
  ## clamp them to the finite extremes so the root-finding is well defined
  logit_pp[logit_pp == -Inf] = min(logit_pp[logit_pp > -Inf], na.rm = TRUE)
  logit_pp[logit_pp == Inf] = max(logit_pp[logit_pp < Inf], na.rm = TRUE)
  ## solve for the shift cc; extendInt = "yes" lets uniroot grow the search
  ## interval instead of erroring when the root falls outside c(-10, 10)
  ## (possible for extreme prevalence targets)
  cc = uniroot(function(xx) mean(g.logit(logit_pp - xx), na.rm = TRUE) - prev,
               interval = c(-10, 10), extendInt = "yes")$root
  g.logit(logit_pp - cc)
}
#' Internal function to get the summary table
#'
#' Builds a cross-tabulation of missingness ("YES" = observed, "NO" = NA)
#' across all count columns plus the note column.
#' @noRd
IDtab = function(mat = NULL, note = NULL) {
  combined = cbind(mat, note)
  colnames(combined)[ncol(combined)] = "note"
  ## flag each cell as observed ("YES") or missing ("NO")
  missing.flag = ifelse(is.na(as.matrix(combined)), "NO", "YES")
  flag.df = as.data.frame(missing.flag)
  colnames(flag.df) = colnames(combined)
  as.data.frame(table(flag.df))
}
#' @title MAP dictionary
#' @description MAP dictionary that maps phecode to CUIs (NLP concept
#'   identifiers used by the MAP algorithm).
#' @format A list of 1866 elements; presumably one element per phecode,
#'   each holding its associated CUIs -- confirm against the packaged data.
#' @examples
#' head(phecode.cuis.list)
#' tail(phecode.cuis.list)
"phecode.cuis.list"
# NOTE(review): the following lines are website boilerplate captured during
# scraping, not R code; commented out so the file remains parseable.
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.