predtools: Prediction Model Tools

Documented in calc_mROC_stats mAUC mROC mROC_analysis mROC_inference

mROC_class_template<-list(p=NA,FPs=NA,TPs=NA)
class(mROC_class_template)<-"mROC"


mROC_inference_template<-list(n_sim=NA,stats=c(A=NA,B=NA,A.dir=NA),pvals=c(A=NA,B=NA),null_stats=c(A.mu=NA,A.se=NA,B.mu=NA,b.se=NA),stat=c(value=NA,df=NA),pval=NA)
class(mROC_inference_template)<-"mROC_inference"

#' @export
print.mROC_inference<-function(x,...)
{
  obj <- x
  cat("Mean calibration statistic (A):",obj$stats['A'],"(",ifelse(obj$stats['A.dir'],"Obs>Pred","Obs<Pred") ,") (p:",obj$pvals['A'],")\nmROC/ROC equality statsitic (B):",obj$stats['B']," (p:",obj$pvals['B'],")\nUnified statistic:",obj$stat['value']," (df:",obj$stat['df'],",p:",obj$pval,")",sep ="")
  return(invisible(obj$pval))
}


aux<-environment()


#' @export
plot.mROC<-function(x,...)
{
  mROC_obj <- x
  step <- list(...)$step
  
  if(is.null(step)) step <- FALSE
  
  if(step)
  {
    sf<-stepfun(mROC_obj$FPs,c(0,mROC_obj$TPs))
    plot(sf,xlim=c(1,0),ylim=c(0,1),type='l',xlab="Specificity",ylab="Sensitivity",...) 
  }
  else
  {
    #TODO: let possible xlim and ylim from ... override the default
    plot(1-mROC_obj$FPs,mROC_obj$TPs,xlim=c(1,0),ylim=c(0,1),type='l',xlab="Specificity",ylab="Sensitivity",...) 
  }
}


#' @export
lines.mROC<-function(x,...)
{
  mROC_obj <- x
  #sf<-stepfun(mROC_obj$FPs,c(0,mROC_obj$TPs))
  #TODO: let possible xlim and ylim from ... override the default
  xlim<-par("usr")[1:2]
  
  x<-mROC_obj$FPs
  
  if(xlim[1]>xlim[2]) x<-1-x
  
  lines(x,mROC_obj$TPs,type='l',...) 
}


#' @title Calculates mROC from the vector of predicted risks
#' Takes in a vector of probabilities and returns mROC values (True positives, False Positives in an object of class mROC)
#' @param p A numeric vector of probabilities.
#' @param ordered Optional, if the vector p is ordered from small to large (if not the function will do it; TRUE is to facilitate fast computations).
#' @return This function returns an object of class mROC. It has three vectors: thresholds on predicted risks (which is the ordered vector of input probabilities), false positive rates (FPs), and true positive rates (TPs). You can directly call the plot function on this object to draw the mROC
#' @export
mROC<-function(p, ordered=FALSE)
{
  if(min(p)<0 || max(p)>1) {stop("Error: invalid probability vector."); return(-1); }
  if(!ordered) p<-p[order(p)]

  sumP1<-sum(p)
  sumP0<-sum(1-p)

  n<-length(p)

  tp<-0
  fp<-0

  roc<-cbind(fp=rep(NA,length(p)),tp=rep(NA,length(p)))

  for(i in n:1)
  {
    fp<-fp+(1-p[i])/sumP0
    tp<-tp+p[i]/sumP1
    roc[n-i+1,]<-c(fp,tp)
  }

  out<-mROC_class_template
  out$p=p
  out$FPs<-c(0,roc[,1])
  out$TPs<-c(0,roc[,2])
  
  return(out)
}



#' Takes in a mROC object and calculates the area under the curve
#' @param mROC_obj An object of class mROC
#' @return Returns the area under the mROC curve
#' @export
mAUC<-function(mROC_obj)
{
  l<-length(mROC_obj$FPs)
  x<-mROC_obj$FPs[-1]-mROC_obj$FPs[-l]
  a<-sum(x*mROC_obj$TPs[-1])
  b<-sum(x*mROC_obj$TPs[-l])
  return((a+b)/2)
}




#' Calculates the absolute surface between the empirical and expected ROCs
#' @param y y vector of binary responses
#' @param p p vector of predicted probabilities (same length as y)
#' @param ordered defaults to false
#' @param fast defaults to true
#' @return Returns a list with the A (mean calibration statistic) and B (mROC/ROC equality statistic) as well as the direction of poential miscalibration (sign of the difference between the ctual and predicted mean risk)
#' @export
calc_mROC_stats<-function(y, p, ordered=FALSE, fast=TRUE)
{
  if(!ordered)
  {
    o<-order(p)
    p<-p[o]
    y<-y[o]
  }
  
  if(fast)
  {
    tmp<-Ccalc_mROC_stats(p,y)  #Warning: The C code still takes p as first argument
    return(c(A=tmp[1],B=tmp[2],A.dir=(mean(y-p)>0)))
  }
  
  n0<-length(which(y==0))
  n1<-length(which(y==1))
  n<-n0+n1
  sumP1<-sum(p)
  sumP0<-sum(1-p)
  
  xo<-0
  xe<-0
  yo<-0
  ye<-0
  io<-n
  ie<-n
  
  B<-0
  
  step<-0
  
  #plot(c(0,1),c(0,1))
  
  while(io>0 && ie>0)
  {
    if(xo<xe) #xo is behind and has to make a jump
    {
      if(y[io]==1)
      {
        step<-0
        yo<-yo+1/n1
      }
      else
      {
        step<-1/n0
        B<-B+abs(yo-ye)*min(step,xe-xo)
      }
      xo<-xo+step
      io<-io-1
    }
    else #now xe is behind
    {
      step<-(1-p[ie])/sumP0
      B<-B+abs(yo-ye)*min(step,xo-xe)
      xe<-xe+step
      ye<-ye+p[ie]/sumP1
      ie<-ie-1
    }
    #lines(xe,ye,col='red',type='o')
    #lines(xo,yo,type='o')
  }
  
  tmp<-mean(y-p)
  
  return(list(A=abs(tmp),B=B,A.dir=(tmp>0)))
}




#' Statistical inference for comparing empirical and expected ROCs. If CI=TRUE then also returns pointwise CIs
#' 
#' @param p vector of probabilities
#' @param y vector of binary response values
#' @param n_sim number of Monte Carlo simulations to calculate p-value
#' @param CI optional. Whether confidence interval should be calculated for each point of mROC. Default is FALSE.
#' @param aux aux optional. whether additional results (component-wise p-values etc) should be written in the package's aux variable. Default is FALSE.
#' @param fast fast optional. Whether the fast code (C++) or slow code (R) should be called. Default is TRUE (R code will be slow unless the dataset is small)
#' @return Returns an object of type mROC_inference containing the results of statistical inference for the mROC curve
#' @export

mROC_inference<-function(y,p,n_sim=100000,CI=FALSE,aux=FALSE,fast=TRUE)
{
  conditional <- FALSE
  out<-mROC_inference_template
  
  out$n_sim<-n_sim

  n<-length(p)

  if(aux)
  {
    aux$AB<<-matrix(NA, nrow =n_sim, ncol=2)
  }

  o<-order(p)
  p<-p[o]
  y<-y[o]
  
  if(conditional)
  {
    stop("Not implemented yet.")
  }
  else #If conditional
  {
    if(fast)
    {
      n1<-sum(y)
      
      tmp<-Csimulate_null_mROC_stats_unconditional(p,n_sim)
      
      stats<-calc_mROC_stats(y,p)
      out$stats<-stats
  
      out$null_stats<-c(A.mu=mean(tmp[,1]),A.se=sqrt(var(tmp[,1]/length(tmp[,1]))),B.mu=mean(tmp[,2]),B.se=sqrt(var(tmp[,2]/length(tmp[,2]))))
  
      cdf1<-ecdf(x = tmp[,1])
      cdf2<-ecdf(x = tmp[,2])
  
      p1<-1-cdf1(stats[1])
      p2<-1-cdf2(stats[2])
      d<- -2*(log(p1)+log(p2))
  
      p1s<-cdf1(tmp[,1])
      p2s<-cdf2(tmp[,2])
      ds<--2*(log(p1s)+log(p2s))
      var_ds<-var(ds)
      e_ds<-mean(ds)
      c<-var_ds/2/e_ds
      k<-2*e_ds^2/var_ds
  
      out$stat<-c(value=d/c,df=k)
  
      p3<-1-pchisq(q = d/c, df = k)
  
      out$pval<-p3
      out$pvals<-c(A=p1,B=p2)
  
      if(aux)
      {
        p3s<-1-pchisq(q = ds/c, df = k)
        aux$ds<<-tmp
        aux$pvals<<-cbind(p1s,p2s,p3s)
      }
    }
    else #if (fast)
    {
      tmp<-matrix(NA,n_sim,2)
      sns<-matrix(NA,nrow = n+1, ncol = n_sim)
      for(i in 1:n_sim)
      {
        y<-rbinom(n,size = 1,prob=p)
        if(CI)
        {
          res<-roc(y,p,quiet = T)
          sns[,i]<-coords(res,(0:n)/n,input="specificity",transpose=FALSE)[,"sensitivity",]
        }
        tmp[i,]<-calc_mROC_stats(y,p,ordered = TRUE)
  
        if(aux)
        {
          aux$AB[i,]<<-c(mean(y)-mean(p),tmp[i])
        }
      }
    }
  } #else conditional
  if(CI)
  {
    tmp2<-apply(sns,MARGIN = 1,FUN = ecdf)
    out$low<-unlist(lapply(tmp2,quantile,0.025))
    out$high<-unlist(lapply(tmp2,quantile,0.975))
  }


  return(out)
}



#' Main eROC analysis that plots ROC and eROC
#' 
#' @param y y vector of observed responses.
#' @param p p vector of predicted probabilities (the same length as observed responses)
#' @param inference 0 for no inference, 1 for p-value only, and 2 for p-value and 95 percent CI.
#' @param n_sim number of simulations
#' @param fast defaults to true
#' @return returns a list containing the results of mROC analysis.
#' @export
mROC_analysis<-function(y,p,inference=0, n_sim, fast=TRUE)
{
  if(inference==2 && fast) stop("Confidence intervals are currently only available when fast=FALSE")
  
  roc_data<-pROC::roc(y,p)
  plot(roc_data)

  out<-list()
  out$roc_data<-roc_data

  message("AUC is ",roc_data$auc)
  
  res<-mROC(p)
  out$mROC_data<-res
  
  message("mAUC is ",mAUC(res))

  lines(1-res$FPs,res$TPs,col='red')

  if(inference)
  {
    inf<-mROC_inference(y=y, p=p, CI=(inference==2), n_sim = n_sim,  fast=fast)
    if(inference==2)
    {
      n<-length(p)
      lines((0:n)/n,inf$low,type='l',col="gray")
      lines((0:n)/n,inf$high,type='l',col="gray")
    }
    out$inference<-inf
    message("Test statistic is ",inf$stat,"\n")
    message("p-value is ",inf$pval,"\n")
  }

  return(out)
}