R/varImpact.R

#' Variable Impact Estimation
#'
#' \code{varImpact} returns variable importance statistics ordered
#' by statistical significance.
#'
#' Returns ordered estimates of variable importance measures using
#' a combination of data-adaptive target parameter estimation and
#' targeted maximum likelihood estimation (TMLE).
#' 
#'  The function performs the following steps:
#'  \enumerate{
#'  \item Drops variables missing more than miss.cut proportion of the time (tunable).
#'  \item Separates covariates into factors and continuous (ordered) variables.
#'  \item Drops variables whose distribution is too uneven - e.g., nearly all one value (tunable),
#'  separately for factors and numeric variables (ADD MORE DETAIL HERE)
#'  \item Removes spaces from factor levels (used for naming dummies later)
#'  \item Removes spaces from variable names
#'  \item Makes a dummy-variable basis for factors, naming the dummies
#'  so they can be traced back to the original factor variable later
#'  \item Makes a new ordered variable of integers mapped to intervals defined by
#'  deciles for each ordered numeric variable (automatically makes
#'  fewer categories if the original variable has < 10 distinct values).
#'  \item Creates an associated list of the number of unique values and the values
#'  themselves for each variable, for use in the variable importance part.
#'  \item Makes a missing-indicator basis for both factors and ordered variables
#'  \item For each variable, after assigning it as A, uses an
#'  optimal histogram function to combine values using the
#'  distribution of A | Y=1, to avoid very small cell sizes in the
#'  distribution of Y vs. A (tunable) (ADD DETAIL)
#'  \item Uses HOPACH to cluster the associated confounder/missingness basis W,
#'  keeping at least the specified minimum number of adjustment variables.
#'  \item Finds the minimum and maximum estimates of E(Ya) with respect to a, after
#'  looping through all values of A* (as processed by the histogram)
#'  \item Returns an estimate of E(Ya(max)-Ya(min)) with its SE
#'  \item Returns 3 LaTeX table files:
#'  \itemize{
#'    \item AllReslts.tex - the file with cross-validated average variable impacts, ordered by statistical significance.
#'    \item byV.tex - the comparison levels used within each validation sample: either the integer ordering of factors or shorthand for the percentile cut-offs (0-1 is the 10th percentile, 10+ is the 100th percentile).
#'    \item ConsistReslts.tex - the ``consistent'' significant results, meaning those with consistent categories chosen as comparison groups among factors and consistent ordering for numeric variables.
#' }
#'  \item Remaining to-do items: add options to avoid errors, such as enforcing a
#' minimum cell size on the validation sample of A vs. Y (minCell)
#' and implementing CV-TMLE; add an Imports statement
#' to the DESCRIPTION file; add examples, authors,
#' references, and see-also entries; and allow reporting of results for variables
#' that randomly lack estimates in some validation samples.
#' }
#' 
#' @param Y outcome of interest (numeric vector)
#' @param data1 data frame of the predictor variables of interest, for
#' which the function returns VIMs
#' @param V number of cross-validation folds
#' @param Q.library library used by SuperLearner for the model of outcome
#' versus predictors
#' @param g.library library used by SuperLearner for the model of the
#' predictor variable of interest versus the other predictors
#' @param fam family ("binomial" or "gaussian")
#' @param minYs minimum number of observations with the event; if there are fewer than minYs, the VIM is skipped
#' @param minCell cut-off for including a category of
#' A in the analysis; it is the minimum cell size allowed in a 2x2 table of the indicator of
#' that level versus the outcome, separately for the training and validation
#' samples
#' @param ncov minimum number of covariates to include as adjustment variables (must
#' be less than the number of basis functions in the adjustment matrix)
#' @param corthres cut-off correlation with the explanatory
#' variable for inclusion of an adjustment variable
#' @param dirout directory to write output
#' @param miss.cut eliminates explanatory (X) variables with proportion
#' of missing obs > miss.cut
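#'
#' @examples
#' \dontrun{
#' ## A minimal usage sketch with simulated data (hypothetical, not from the
#' ## package); assumes the tmle, SuperLearner, and other dependencies are
#' ## installed and loaded:
#' set.seed(1)
#' n <- 200
#' dat <- data.frame(x1 = rnorm(n),
#'                   x2 = factor(sample(c("a", "b", "c"), n, replace = TRUE)))
#' Y <- rbinom(n, 1, plogis(dat$x1))
#' varImpact(Y = Y, data1 = dat, V = 2, Q.library = c("SL.glm", "SL.mean"),
#'           g.library = "SL.glm", dirout = paste0(tempdir(), "/"))
#' }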

varImpact = function(Y, data1, V, Q.library = c("SL.gam", "SL.glmnet", "SL.mean", "SL.inter2"),
                     g.library = c("SL.stepAIC"), fam = "binomial", minYs = 15, minCell = 0,
                     ncov = 10, corthres = 0.8, dirout = NULL, miss.cut = 0.5)
  {

  
    ###################################   OUTSTANDING PROGRAMMING ISSUES
    ###### 1) How to add "null" columns to data frame?
    
    ### Get missingness for each column
    ######## Function for getting total number missing values for vector
  sum.na=function(x){sum(is.na(x))}
  ######## Applied to Explanatory (X) data frame
  sna=apply(data1,2,sum.na)
  ####### n = # of rows
  n=dim(data1)[1]
  ####### missing proportion by variable
  mis.prop = sna/n
  ####### cut-off for eliminating variable for proportion of obs missing
  data1=data1[,mis.prop < miss.cut]
  ###### Identify numeric variables (ordered); use sapply so data frame columns keep their types
  ind.num=sapply(data1,is.numeric)
  ## Identify factor variables
  isit.factor = sapply(data1,is.factor)
  ndata1=dim(data1)[2]
  ### Function that counts # of unique values
  num.val=function(x){length(unique(x))}
  ### num.values is vector of number of unique values by variable
  num.values = apply(data1,2,num.val)
  ### Function that returns logical T if no variability by variable
  qq.range = function(x,rr) {
    qq = quantile(unclass(x),probs=rr,na.rm=T)
    (qq[2]-qq[1])==0
  }
  qq.range.num = function(x,rr) {
    qq = quantile(x,probs=rr,na.rm=T)
    (qq[2]-qq[1])==0
  }
  if(sum(isit.factor)==0){
    n.fac=0
  }
  #### data.fac is data frame of variables that are factors
  if(sum(isit.factor)>0){
    data.fac=data1[,isit.factor,drop=F]
  ## Replace blanks with NA's
  nc=dim(data.fac)[2]
  for(i in 1:nc) {
    xx = as.character(data.fac[,i])
    xx = factor(xx,exclude = "")
    data.fac[,i]=xx
  }
  
    dropind = NULL
  for(i in 1:nc) {
    dropind = c(dropind,qq.range(data.fac[,i],rr=c(0.10,0.90)))
  }
  
  data.fac=data.fac[,dropind==F,drop=F]
  ln.unique=function(x){length(unique(x))}
  num.cat = apply(data.fac,2,ln.unique)
  sna=apply(data.fac,2,sum.na)
  n=dim(data.fac)[1]
  mis.prop = sna/n
  data.fac=data.fac[,mis.prop < 0.5,drop=F]
  n.fac=dim(data.fac)[2]
  }
  
  ## Numeric variables
  data.num = data1[,isit.factor==F,drop=F]
  if((dim(data.num)[2])==0) {
    n.num=0}
  if((dim(data.num)[2])>0) {
  dropind = NULL
  nc = dim(data.num)[2]
  for(i in 1:nc) {
    dropind = c(dropind,qq.range.num(data.num[,i],rr=c(0.10,0.90)))
  }
  data.num = data.num[,dropind==F,drop=F]
  n.num=dim(data.num)[2]
  }
  ###########################################################################
  ###########################################################################
  ####### The following largely duplicates later code, modified for the case of no factors
  ####### Needs re-coding to remove the redundancy
  ###########################################################################
  if(n.num>0 & n.fac==0) { 
    ln.unique=function(x){length(unique(x))}
    X=data.num
    xc=dim(X)[2]
    qt=apply(na.omit(X),2,quantile,probs=seq(0.1,0.9,0.1))
    newX=NULL
    coln=NULL
    varn=colnames(X)
    num.cat = apply(X,2,ln.unique)
    ### Processing continuous variables
    Xnew=NULL
    for(k in 1:xc) {
      Xt=X[,k]
      tst=as.numeric(discretize(Xt,method="frequency",categories=10,ordered=T))
      Xnew=cbind(Xnew,tst)
    }
    colnames(Xnew)=varn
    data.cont.dist=Xnew
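    ## e.g., as called above, discretize(x, method="frequency", categories=10,
    ## ordered=T) maps x into (up to) 10 roughly equal-frequency decile bins;
    ## variables with fewer than 10 distinct values yield fewer bins.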
    ############### 
    ##########  Missing Basis
    xp=dim(data.cont.dist)[2]
    sum.na=function(x){sum(is.na(x))}
    sna=apply(data.cont.dist,2,sum.na)
    nmesX=colnames(data.cont.dist)
    miss.cont=NULL
    nmesm=NULL
    for(k in 1:xp){
      if(sna[k] > 0) {
        ix=as.numeric(is.na(data.cont.dist[,k])==F)
        miss.cont=cbind(miss.cont,ix)
        nmesm=c(nmesm,paste("Imiss_",nmesX[k],sep="")) 
      }
    } 
    colnames(miss.cont)=nmesm
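    ## e.g., a column x1 containing any NA contributes an indicator Imiss_x1
    ## that is 1 where x1 is observed and 0 where it is missing; fully
    ## observed columns contribute nothing to the missingness basis.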
    numcat.cont = apply(data.cont.dist,2,ln.unique)
    xc=length(numcat.cont)
    cats.cont <- lapply(1:xc, function(i) {
      sort(unique(data.cont.dist[,i])) })
    
    
    ## Find the level of covariate that has lowest risk
    data.numW=data.num
    data.numW[is.na(data.num)]=0
    n=length(Y)
    get.tmle.est=function(Y,A,W,delta=NULL,Q.lib,g.lib) {
      ## Because of a quirk in the tmle program, delete observations
      ## with delta=0 if their count is > 0 and < 10
      n=length(Y)
      inc=rep(TRUE,n)
      if(is.null(delta)==F) {
        ss=sum(delta==0)
        if(ss > 0 & ss < 10) {
          inc[delta==0]=FALSE  
        }
      }
      Y=Y[inc]
      A=A[inc]
      W=W[inc,,drop=F]
      delta=delta[inc]
      tmle.1=tmle(Y, A,W,Delta=delta,  g.SL.library=g.lib,Q.SL.library =Q.lib, family = fam, verbose = FALSE)
      g1=tmle.1$g$g1W
      Qst=tmle.1$Qstar[,2]
      theta=mean(Qst)
      IC=(A/g1)*(Y-Qst)+Qst-theta
      return(list(theta=theta,IC=IC)) 
    }
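    ## Commented usage sketch (hypothetical inputs; requires the tmle and
    ## SuperLearner packages; fam is taken from the enclosing varImpact call):
    ##   W.demo = data.frame(w1 = rnorm(100))
    ##   A.demo = rbinom(100, 1, 0.5)
    ##   Y.demo = rbinom(100, 1, plogis(A.demo))
    ##   est = get.tmle.est(Y.demo, A.demo, W.demo, delta = rep(1, 100),
    ##                      Q.lib = "SL.mean", g.lib = "SL.mean")
    ##   est$theta   # targeted estimate of E[Y(a)] (mean of targeted predictions)
    ##   var(est$IC) # influence-curve variance, used below for the SE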
    
    #### Stratified CV to ensure balance (by one grouping variable, Y)
    CC.CV=function(V,Y) {
      nn=length(Y)
      tt=table(Y)
      Ys=sort(unique(Y))  ## sort so Ys aligns with the ordering of table(Y)
      nys=length(Ys)
      out=rep(NA,nn)
      for(i in 1:nys) {
        n=as.numeric(tt[i])
        xx=cvFolds(n, K=V, R = 1,type = "random")$which
        out[Y==Ys[i]]=xx
      }
      return(out)
    }
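    ## Commented sketch (hypothetical data): folds are drawn within each level
    ## of Y, so events are spread evenly across the V folds:
    ##   set.seed(1)
    ##   Y.demo = rbinom(30, 1, 0.3)
    ##   table(CC.CV(V = 2, Y = Y.demo), Y.demo)  # roughly equal events per fold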
    folds=CC.CV(V,Y)
    max.2=function(x) {max(x^2,na.rm=T)}
    
    ###############################################################################
    cor.two=function(x,y) {
      (cor(na.omit(cbind(x,y)))[1,2])^2
    }
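    ## e.g., cor.two(c(1, 2, 3, NA), c(2, 4, 6, 8)) returns 1: the row with the
    ## NA is dropped and the squared Pearson correlation of the rest is computed.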
    folds=CC.CV(V,Y)
    #  detach("package:cvTools", unload=TRUE)
    #  detach("package:lattice", unload=TRUE)
    #  library(doParallel)
    #  cl <- makeCluster(20)
    #  registerDoParallel(cl)
    names.cont=colnames(data.cont.dist)
    xc=dim(data.cont.dist)[2]
    n.cont=dim(data.cont.dist)[1]
    out.put <- lapply(1:xc, function(i) {
      nameA=names.cont[i]
      thetaV=NULL
      varICV = NULL
      labV=NULL
      EY0V=NULL
      EY1V=NULL
      nV=NULL
      for(kk in 1:V) {
#        cat(" i = ",i," V = ",kk,"\n")
        At=data.cont.dist[folds!=kk,i]
        Av=data.cont.dist[folds==kk,i]
        Yt=Y[folds != kk]
        Yv=Y[folds == kk]
        AY1=At[Yt==1 & is.na(At)==F]
        hh=histogram::histogram(AY1,verbose=F,type="irregular")$breaks
        if(hh[length(hh)]<max(At,na.rm=T)){hh[length(hh)]=max(At,na.rm=T)+0.1}
        if(hh[1] > min(At[At>0],na.rm=T)){hh[1]=min(At[At>0],na.rm=T)-0.1}
        Atnew=cut(At,breaks=hh)
        Avnew=cut(Av,breaks=hh)
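        ## (the irregular histogram is fit to A among events only, so the break
        ## endpoints are widened above to cover the full observed range of A
        ## before cutting both the training and validation samples)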
        if(length(na.omit(unique(Atnew)))<=1 | length(na.omit(unique(Avnew)))<=1) {
          thetaV=c(thetaV,NA)
          varICV = c(varICV,NA)
          labV=rbind(labV,c(NA,NA))
          EY0V=c(EY0V,NA)
          EY1V=c(EY1V,NA)
          nV=c(nV,NA)
        }
        if(length(na.omit(unique(Atnew)))>1 & length(na.omit(unique(Avnew))) > 1) {
          labs=names(table(Atnew))
          Atnew=as.numeric(Atnew)-1
          Avnew=as.numeric(Avnew)-1
          numcat.cont[i]=length(labs)
          #     change this to match what was done for factors - once 
          #      cats.cont[[i]]=as.numeric(na.omit(unique(Atnew)))
          cats.cont[[i]]=as.numeric(names(table(Atnew)))
          ### data.numW is just the same as data.num except with NA's replaced by 0's.
          if(is.null(miss.cont)){miss.cont=matrix(0,n.cont,1)}
          Wt=data.frame(data.numW[folds!=kk,-i],miss.cont[folds!=kk,])
          nmesW = names(Wt)
          mtch=match(nmesW,paste("Imiss_",nameA,sep=""))
          Wt=Wt[,is.na(mtch),drop=F]
          Wv=data.frame(data.numW[folds==kk,-i],miss.cont[folds==kk,])
          Wv=Wv[,is.na(mtch),drop=F]
          ### Drop any adjustment variables overly correlated with At (keep squared corr < corthres)
          corAt=apply(Wt,2,cor.two,y=At)
#          cat("i = ",i," maxCor = ",max(corAt,na.rm=T),"\n")
          incc= corAt < corthres & is.na(corAt)==F
          Wv=Wv[,incc,drop=F]
          Wt=Wt[,incc,drop=F]
          #### Use HOPACH to reduce dimension of W to some level of tree
          nw=dim(Wv)[2]
          ### Use HOPACH below only if the number of covariates exceeds 10
if(nw <=10) {
  Wtsht=Wt
  Wvsht=Wv
}
if(nw > 10) {
  mydist<-as.matrix(distancematrix(t(Wt),d="cosangle",na.rm=T))
  hopach.1<-try(hopach(t(Wt),dmat=mydist,mss="mean",verbose=FALSE),silent=TRUE)
  if(class(hopach.1)=="try-error"){
    hopach.1<-try(hopach(t(Wt),dmat=mydist,mss="med",verbose=FALSE),silent=TRUE)
  }
  if(class(hopach.1)=="try-error"){
    Wtsht=Wt
    Wvsht=Wv
  }
  if(class(hopach.1) != "try-error") {
    nlvls=nchar(max(hopach.1$final$labels))
    no<-trunc(mean(log10(hopach.1$final$labels)))
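    ## hopach labels encode the tree path as digits (e.g., label 134 means
    ## cluster 1, then subcluster 3, then sub-subcluster 4), so truncating
    ## labels/10^(no-(ii-1)) collapses them to their first ii digits, i.e.,
    ## the clustering at level ii of the tree.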
    ### Find the highest level of the tree where the number of clusters is >= ncov
    lvl = 1:nlvls
    ncv=NULL
    for(ii in lvl) {ncv=c(ncv,length(unique(trunc(hopach.1$final$labels/10^(no-(ii-1))))))}
    ncv=unique(ncv)
    lev = min(min(nlvls,dim(Wt)[2]),min(lvl[ncv>=ncov]))
    two.clust<-unique(trunc(hopach.1$final$labels/(10^(no-(lev-1)))))
    md<-hopach.1$final$medoids
    mm=md[,1]%in%two.clust
    incc=md[mm,2]
    Wtsht=Wt[,incc]
    Wvsht=Wv[,incc]
  }
}
deltat=as.numeric(is.na(Yt)==F & is.na(Atnew)==F)
          deltav=as.numeric(is.na(Yv)==F & is.na(Avnew)==F)
          vals=cats.cont[[i]]
          maxEY1=-100000
          minEY1=1000000
          minj=0
          maxj=0
          ICmin=NULL
          ICmax=NULL
          Atnew[is.na(Atnew)]=-1
          Avnew[is.na(Avnew)]=-1
         if(min(table(Avnew[Avnew >= 0],Yv[Avnew >= 0])) <= minCell) {
            thetaV=c(thetaV,NA)
            varICV = c(varICV,NA)
            labV=rbind(labV,c(NA,NA))
            EY0V=c(EY0V,NA)
            EY1V=c(EY1V,NA)
            nV=c(nV,NA)
          }
          if(min(table(Avnew[Avnew >= 0],Yv[Avnew >= 0])) > minCell) {
            labmin=NULL
            labmax=NULL
            for(j in 1:numcat.cont[i]) {
#              cat(" i = ",i," kk = ",kk," j = ",j,"\n")
              IA = as.numeric(Atnew==vals[j])
              res = get.tmle.est(Yt,IA,Wtsht,deltat,Q.lib=Q.library,g.lib=g.library)
              EY1=res$theta
              if(EY1 < minEY1){
                minj = j
                minEY1=EY1
                labmin=labs[j]}
              if(EY1 > maxEY1){
                maxj = j
                maxEY1=EY1
                labmax=labs[j]
              }    
            }  
            ##### Now, estimate on validation sample
            IA = as.numeric(Avnew==vals[minj])
            res = get.tmle.est(Yv,IA,Wvsht,deltav,Q.lib=Q.library,g.lib=g.library)
            IC0=res$IC
            EY0=res$theta
            IA = as.numeric(Avnew==vals[maxj])
            res = get.tmle.est(Yv,IA,Wvsht,deltav,Q.lib=Q.library,g.lib=g.library)
            IC1=res$IC
            EY1=res$theta
            thetaV=c(thetaV,EY1-EY0)
            varICV = c(varICV,var(IC1-IC0))
            labV=rbind(labV,c(labmin,labmax))
            EY0V=c(EY0V,EY0)
            EY1V=c(EY1V,EY1)
            nV=c(nV,length(Yv)) 
          }
        }
      }
      #print(list(EY1V,EY0V,thetaV,varICV,labV,nV))
      list(EY1V,EY0V,thetaV,varICV,labV,nV)
    }
    )	
    
    vars.cont=colnames(data.cont.dist)
    stf=c("vars.cont","out.cont")
    
    #vars=colnames(data.fac)
    nc=length(vars.cont)
    #nf=length(vars)
    factr=rep("ordered",nc)
    vars=vars.cont
    names(out.put)=vars
    lngth=sapply(out.put,function(x) length(x))
    out.put=out.put[lngth==6]
    vars=vars[lngth==6]
    factr=factr[lngth==6]
    lngth2=sapply(out.put,function(x) length(na.omit(x[[1]])))
    out.put=out.put[lngth2==V]
    vars=vars[lngth2==V]
    factr=factr[lngth2==V]

    tst=lapply(out.put, function(x) x[[3]])
    tst=do.call(rbind,tst)
    tot.na=function(x){sum(is.na(x))}
    xx=apply(tst,1,tot.na)
    out.sht=out.put[xx==0]
    vars=vars[xx==0]
    factr=factr[xx==0]
    
    tst=lapply(out.sht, function(x) x[[1]])
    EY1=do.call(rbind,tst)
    tst=lapply(out.sht, function(x) x[[2]])
    EY0=do.call(rbind,tst)
    tst=lapply(out.sht, function(x) x[[3]])
    theta=do.call(rbind,tst)
    tst=lapply(out.sht, function(x) x[[4]])
    varIC=do.call(rbind,tst)
    tst=lapply(out.sht, function(x) x[[6]])
    nV=do.call(rbind,tst)
    n=sum(nV[1,])
    SEV = sqrt(varIC/nV)
    ##### Get the (low, high) comparison labels chosen within each fold
    labs.get=function(x,fold) {
      ## x is a fold-by-2 matrix of (low, high) labels; flatten it row-wise and
      ## name the entries v.<k>a_L / v.<k>a_H for fold k (works for any fold)
      lbel=rep(1:fold,each=2)
      out=as.vector(t(x))
      names(out)=paste("v.",lbel,rep(c("a_L","a_H"),fold),sep="")
      out}
    tst=lapply(out.sht, function(x) x[[5]])
    tst=lapply(tst,labs.get,fold=V)
    lbs=do.call(rbind,tst)
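    ## e.g., with V = 2 a row of lbs reads
    ##   c(v.1a_L = <low-risk level, fold 1>, v.1a_H = <high-risk level, fold 1>,
    ##     v.2a_L = <low-risk level, fold 2>, v.2a_H = <high-risk level, fold 2>)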
    
    
    meanvarIC=apply(varIC,1,mean)
    psi=apply(theta,1,mean)
    SE=sqrt(meanvarIC/n)
    lower=psi-1.96*SE
    upper=psi+1.96*SE
    signdig=3
    CI95=paste("(",signif(lower,signdig)," - ",signif(upper,signdig),")",sep="")
    # 1-sided p-value
    pvalue=1-pnorm(psi/SE)
    #####  FOR THETA (generalize to chi-square test?)
    TT = (theta[,1]-theta[,2])/sqrt(SEV[,1]^2+SEV[,2]^2)
    pval.comp=2*(1-pnorm(abs(TT)))
    #####  FOR levels (just make sure in same order)
    nc=n=length(factr)
    dir=NULL
    ### Ordered variables first
    for(i in 1:V) {
      ltemp=lbs[1:nc,i*2-1]
      xx=regexpr(",",ltemp)
      lwr=as.numeric(substr(ltemp,2,xx-1))
      utemp=lbs[1:nc,i*2]
      xx=regexpr(",",utemp)
      nx=nchar(utemp)
      uwr=as.numeric(substr(utemp,xx+1,nx-1))
      dir=cbind(dir,uwr>lwr)
    }
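    ## e.g., a fold's low-risk label "(0.5,1.2]" gives lwr = 0.5 and its
    ## high-risk label "(3.1,4]" gives uwr = 4, so dir records whether the
    ## high-risk interval sits above the low-risk one in that fold.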
    length.uniq=function(x){length(unique(x))}
    cons=apply(dir,1,length.uniq)
    
    consist= (cons==1 & pval.comp > 0.05)
procs<-c("Holm","BH")
if(n > 1) {
  res<-mt.rawp2adjp(pvalue,procs)
  oo<-res$index
  outres=data.frame(factor=factr[oo],theta[oo,],psi[oo],
                    CI95[oo],res$adj,lbs[oo,],consist[oo])
}
if(n==1) {
  outres=data.frame(factor=factr,theta,psi,
                    CI95,rawp=pvalue,Holm=pvalue,BH=pvalue,lbs,consist)
}

    outres=outres[is.na(outres[,"rawp"]) ==F,]
    names(outres)[1:5]=c("VarType","psiV1","psiV2","AvePsi","CI95")
    names(outres)[13]="Consistent"
    
    ### Get Consistency Measure and only significant
    outres.cons=outres[outres[,"Consistent"],-13]
    outres.cons=outres.cons[outres.cons[,"BH"]<0.05,]
    
    
    outres.byV=outres[,c(2,3,9:12)]
    print(xtable(outres.byV,caption="data Variable Importance Results By Estimation Sample",label="byV",digits=4),type="latex",file=paste(dirout,"byV.tex",sep=""), caption.placement="top",include.rownames=T)
    
    outres.all=outres[,c(4,5,6,8)]
    print(xtable(outres.all, caption="data Variable Importance Results for Combined Estimates",label="allRes",digits=4),type="latex",file=paste(dirout,"AllReslts.tex",sep=""),caption.placement="top",include.rownames=T)
    
    
    print(xtable(outres.cons[,c(4,5)],caption="Subset of Significant and ``Consistent'' Results",label="consisRes",digits=4),type="latex",file=paste(dirout,"ConsistReslts.tex",sep=""),caption.placement="top",include.rownames=T)
  }
  ###########################################################################
  ###########################end of just continuous variables  #############
  ###########################################################################
  if(n.num>0 & n.fac > 0) { 
    ## For each factor, apply qq.range function and get rid of those where "true"
    #### data.fac is data frame of variables that are factors
    ##############
    xp=dim(data.fac)[2]
    facnames = names(data.fac)
    nam.fac = function(x,name) {
      nc=nchar(x)
      out = paste(name,substr(x,2,nc),sep="XX")
      out = gsub(" ","",out)  
      return(out)
    }
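    ## e.g., nam.fac("xblue sky", "color") returns "colorXXbluesky": the leading
    ## "x" that model.matrix prepends is stripped and spaces are removed, so
    ## each dummy name traces back to its source factor via the "XX" separator.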
    newX=NULL
    coln=NULL
    options(na.action='na.pass')
    for(i in 1:xp) {
#      cat(" i = ",i,"\n")
      x=data.fac[,i]
      inds <- model.matrix(~ x - 1)[,-1]
      nmes=colnames(inds)
      if(is.null(nmes)) {nmes2=facnames[i]}
      if(!is.null(nmes)){nmes2=nam.fac(nmes,facnames[i])}
      coln=c(coln,nmes2)
      newX=cbind(newX,inds)
    }
    colnames(newX)=coln
    ## Indexing vector for dummy basis back to original factors
    cc = regexpr("XX",coln)
    ncc=nchar(coln)
    cc[cc<0]=ncc[cc<0]+1
    fac.indx=substr(coln,1,cc-1)
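    ## e.g., coln = "colorXXblue" gives fac.indx = "color", while a name with
    ## no "XX" maps to itself (cc is set just past the end of the string).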
    datafac.dum=newX
    ### Now, make deciles for continuous variables 
    X=data.num
    xc=dim(X)[2]
    qt=apply(na.omit(X),2,quantile,probs=seq(0.1,0.9,0.1))
    newX=NULL
    coln=NULL
    varn=colnames(X)
    num.cat = apply(X,2,ln.unique)
    ### Processing continuous variables
    Xnew=NULL
    for(k in 1:xc) {
      Xt=X[,k]
      tst=as.numeric(discretize(Xt,method="frequency",categories=10,ordered=T))
      Xnew=cbind(Xnew,tst)
    }
    colnames(Xnew)=varn
    data.cont.dist=Xnew
    ############### 
    ##########  Missing Basis
    xp=dim(data.cont.dist)[2]
    n.cont=dim(data.cont.dist)[1]
    sum.na=function(x){sum(is.na(x))}
    sna=apply(data.cont.dist,2,sum.na)
    nmesX=colnames(data.cont.dist)
    miss.cont=NULL
    nmesm=NULL
    for(k in 1:xp){
      if(sna[k] > 0) {
        ix=as.numeric(is.na(data.cont.dist[,k])==F)
        miss.cont=cbind(miss.cont,ix)
        nmesm=c(nmesm,paste("Imiss_",nmesX[k],sep="")) 
      }
    } 
    #if(is.null(miss.cont)){miss.cont=  rep(1,n.cont)}
    colnames(miss.cont)=nmesm
    ############  Missing Basis for Factors
    xp=dim(datafac.dum)[2]
    sna=apply(datafac.dum,2,sum.na)
    nmesX=colnames(datafac.dum)
    miss.fac=NULL
    nmesm=NULL
    for(k in 1:xp){
      if(sna[k] > 0) {
        ix=as.numeric(is.na(datafac.dum[,k])==F)
        datafac.dum[is.na(datafac.dum[,k]),k]=0
        miss.fac=cbind(miss.fac,ix)
        nmesm=c(nmesm,paste("Imiss_",nmesX[k],sep=""))
      }
    } 
    colnames(miss.fac)=nmesm
    
    #### Start Estimator First using All Data
    ###################################################
    ##### Start with continuous variables
    ###################################################
    
    numcat.cont = apply(data.cont.dist,2,ln.unique)
    xc=length(numcat.cont)
    cats.cont <- lapply(1:xc, function(i) {
      sort(unique(data.cont.dist[,i])) })
    
    
    ## Find the level of covariate that has lowest risk
    datafac.dumW=datafac.dum
    datafac.dumW[is.na(datafac.dum)]=0
    data.numW=data.num
    data.numW[is.na(data.num)]=0
    n=length(Y)
    get.tmle.est=function(Y,A,W,delta=NULL,Q.lib,g.lib) {
      ## Because of a quirk in the tmle program, delete observations
      ## with delta=0 if their count is > 0 and < 10
      n=length(Y)
      inc=rep(TRUE,n)
      if(is.null(delta)==F) {
        ss=sum(delta==0)
        if(ss > 0 & ss < 10) {
          inc[delta==0]=FALSE  
        }
      }
      Y=Y[inc]
      A=A[inc]
      W=W[inc,,drop=F]
      delta=delta[inc]
      tmle.1=tmle(Y, A,W,Delta=delta,  g.SL.library=g.lib,Q.SL.library =Q.lib, family = fam, verbose = FALSE)
      g1=tmle.1$g$g1W
      Qst=tmle.1$Qstar[,2]
      theta=mean(Qst)
      IC=(A/g1)*(Y-Qst)+Qst-theta
      return(list(theta=theta,IC=IC)) 
    }
    
    #### Stratified CV to ensure balance (by one grouping variable, Y)
    CC.CV=function(V,Y) {
      nn=length(Y)
      tt=table(Y)
      Ys=sort(unique(Y))  ## sort so Ys aligns with the ordering of table(Y)
      nys=length(Ys)
      out=rep(NA,nn)
      for(i in 1:nys) {
        n=as.numeric(tt[i])
        xx=cvFolds(n, K=V, R = 1,type = "random")$which
        out[Y==Ys[i]]=xx
      }
      return(out)
    }
    folds=CC.CV(V,Y)
    max.2=function(x) {max(x^2,na.rm=T)}
    
    #detach("package:cvTools", unload=TRUE)
    #detach("package:lattice", unload=TRUE)
    #library(doParallel)
    #cl <- makeCluster(10)
    #registerDoParallel(cl)
    ### Below is to get indexing vectors so that
    ### any basis functions related to current A
    ### that are in covariate matrix can be removed
    names.fac=colnames(data.fac)
    nmes.facW=colnames(datafac.dumW)
    nmes.mfacW=colnames(miss.fac)
    nchar.facW=nchar(nmes.facW)+1
    nchar.mfacW=nchar(nmes.mfacW)+1
    XXm=regexpr("XX",nmes.facW)
    XXm[XXm<0]=nchar.facW[XXm<0]
    XXm2=regexpr("XX",nmes.mfacW)
    XXm2[XXm2<0]=nchar.mfacW[XXm2<0]
    vars.facW=substr(nmes.facW,1,XXm-1)
    vars.mfacW=substr(nmes.mfacW,7,XXm2-1)
    xc=dim(data.fac)[2]
    n.fac=dim(data.fac)[1]
    ###############################################################################
    ###### VIM for Factors
    ###############################################################################
    ###############################################################################
    output <- lapply(1:xc, function(i) {
      nameA=names.fac[i]
       cat(" i = ",i,"Var = ",nameA, " out of ",xc," factor variables","\n")
      thetaV=NULL
      varICV = NULL
      labV=NULL
      EY0V=NULL
      EY1V=NULL
      nV=NULL
      for(kk in 1:V) {
#        cat(" i = ",i," V = ",kk,"\n")
        At=data.fac[folds!=kk,i]
        Av=data.fac[folds==kk,i]
        Yt=Y[folds != kk]
        Yv=Y[folds == kk]
        ### Remove from W any dummy/missingness basis functions derived from the current A
        mtch1=match(vars.facW,nameA)
        mtch2=match(vars.mfacW,nameA)
        Adum=data.frame(datafac.dum[,is.na(mtch1)==F])
        dumW=datafac.dum[,is.na(mtch1)]
        missdumW=miss.fac[,is.na(mtch2)]
        if(is.null(missdumW)){missdumW=rep(NA,n.fac)}
        if(is.null(miss.cont)){miss.cont=rep(NA,n.fac)}
        if(is.null(dumW)){dumW=rep(NA,n.fac)}
        if(is.null(data.numW)){data.numW=rep(NA,n.fac)}
        W=data.frame(data.numW,miss.cont,dumW,missdumW)
        W=W[, !apply(is.na(W), 2, all),drop=F]
        Wt=W[folds!=kk,,drop=F]
        Wv=W[folds==kk,,drop=F]
        Adum=data.frame(Adum[folds !=kk,])
        ### Drop any adjustment variables overly correlated with At (keep squared corr < corthres)
        corAt=apply(cor(Adum,Wt,use="complete.obs"),2,max.2)
#        cat("i = ",i," maxCor = ",max(corAt,na.rm=T),"\n")
        incc= abs(corAt) < corthres & is.na(corAt)==F
        Wv=Wv[,incc,drop=F]
        Wt=Wt[,incc,drop=F]
        nw=dim(Wv)[2]
        ### Use HOPACH below only if the number of covariates exceeds 10
if(nw <=10) {
  Wtsht=Wt
  Wvsht=Wv
}
if(nw > 10) {
  mydist<-as.matrix(distancematrix(t(Wt),d="cosangle",na.rm=T))
  hopach.1<-try(hopach(t(Wt),dmat=mydist,mss="mean",verbose=FALSE),silent=TRUE)
  if(class(hopach.1)=="try-error"){
    hopach.1<-try(hopach(t(Wt),dmat=mydist,mss="med",verbose=FALSE),silent=TRUE)
  }
  if(class(hopach.1)=="try-error"){
    Wtsht=Wt
    Wvsht=Wv
  }
  if(class(hopach.1) != "try-error") {
    nlvls=nchar(max(hopach.1$final$labels))
    no<-trunc(mean(log10(hopach.1$final$labels)))
    ### Find the highest level of the tree where the number of clusters is >= ncov
    lvl = 1:nlvls
    ncv=NULL
    for(ii in lvl) {ncv=c(ncv,length(unique(trunc(hopach.1$final$labels/10^(no-(ii-1))))))}
    ncv=unique(ncv)
    lev = min(min(nlvls,dim(Wt)[2]),min(lvl[ncv>=ncov]))
    two.clust<-unique(trunc(hopach.1$final$labels/(10^(no-(lev-1)))))
    md<-hopach.1$final$medoids
    mm=md[,1]%in%two.clust
    incc=md[mm,2]
    Wtsht=Wt[,incc]
    Wvsht=Wv[,incc]
  }
}
deltat=as.numeric(is.na(Yt)==F & is.na(At)==F)
        deltav=as.numeric(is.na(Yv)==F & is.na(Av)==F)
#  To avoid crashing TMLE function just drop obs missing A or Y if the total number of missing is < 10
        if(sum(deltat==0)<10){
          Yt=Yt[deltat==1]
          At=At[deltat==1]
          Wtsht=Wtsht[deltat==1,]
          deltat=deltat[deltat==1]
        }
      if(sum(deltav==0)<10){
        Yv=Yv[deltav==1]
        Av=Av[deltav==1]
        Wvsht=Wvsht[deltav==1,]
        deltav=deltav[deltav==1]
      }
        levA=levels(At)
        minc=apply(table(Av,Yv),1,min)
        minc2=apply(table(At,Yt),1,min)
        vals=levA[pmin(minc,minc2)>minCell]
        num.cat=length(vals)    
        nYt=sum(Yt[is.na(At)==FALSE])
        nYv=sum(Yv[is.na(Av)==FALSE])
        ## Skip if (1) no more than one category of A is left, or (2) the
        ## missingness pattern for A leaves too few outcome events
        ## in either sample (< minYs)
        if(num.cat < 2 | min(nYt,nYv)<minYs) {
          thetaV=c(thetaV,NA)
          varICV = c(varICV,NA)
          labV=rbind(labV,c(NA,NA))
          EY0V=c(EY0V,NA)
          EY1V=c(EY1V,NA)
          nV=c(nV,NA)
        }
        if(num.cat >= 2 & min(nYt,nYv) >= minYs) {  
          labmin=NULL
          labmax=NULL
          maxEY1=-100000
          minEY1=1000000
          minj=0
          maxj=0
          ICmin=NULL
          ICmax=NULL 
          for(j in 1:num.cat) {
#            cat(" j = ",j,"\n")
            IA = as.numeric(At==vals[j])
            IA[is.na(IA)]=0
#            if(min(table(IA,Yt))>=)
            res = get.tmle.est(Yt,IA,Wtsht,deltat,Q.lib=Q.library,g.lib=g.library)
            EY1=res$theta
            if(EY1 < minEY1){
              minj = j
              minEY1=EY1
              labmin=vals[j]}
            if(EY1 > maxEY1){
              maxj = j
              maxEY1=EY1
              labmax=vals[j]
            }    
          }  
          ##### Now, estimate on validation sample
          IA = as.numeric(Av==vals[minj])
          IA[is.na(IA)]=0
          res = get.tmle.est(Yv,IA,Wvsht,deltav,Q.lib=Q.library,g.lib=g.library)
          IC0=res$IC
          EY0=res$theta
          IA = as.numeric(Av==vals[maxj])
          IA[is.na(IA)]=0
          res = get.tmle.est(Yv,IA,Wvsht,deltav,Q.lib=Q.library,g.lib=g.library)
          IC1=res$IC
          EY1=res$theta
          thetaV=c(thetaV,EY1-EY0)
          varICV = c(varICV,var(IC1-IC0))
          labV=rbind(labV,c(labmin,labmax))
          EY0V=c(EY0V,EY0)
          EY1V=c(EY1V,EY1)
          nV=c(nV,length(Yv)) 
        }
      }
      list(EY1V,EY0V,thetaV,varICV,labV,nV,"factor")
      #print(data.frame(EY1V,EY0V,thetaV,varICV,labV,nV))
    }
    )
    ###############################################################################
    #################################  Repeat for Continuous Explanatory Variables
    ###############################################################################
    cor.two=function(x,y) {
      (cor(na.omit(cbind(x,y)))[1,2])^2
    }
    folds=CC.CV(V,Y)
    #  detach("package:cvTools", unload=TRUE)
    #  detach("package:lattice", unload=TRUE)
    #  library(doParallel)
    #  cl <- makeCluster(20)
    #  registerDoParallel(cl)
    names.cont=colnames(data.cont.dist)
    xc=dim(data.cont.dist)[2]
    n.cont=dim(data.cont.dist)[1]
    out.put <- lapply(1:xc, function(i) {
      nameA=names.cont[i]
      cat(" i = ",i,"Var = ",nameA, " out of ",xc," numeric variables","\n")
      thetaV=NULL
      varICV = NULL
      labV=NULL
      EY0V=NULL
      EY1V=NULL
      nV=NULL
      for(kk in 1:V) {
#        cat(" v = ",kk," out of V = ",V,"\n")
        At=data.cont.dist[folds!=kk,i]
        Av=data.cont.dist[folds==kk,i]
        Yt=Y[folds != kk]
        Yv=Y[folds == kk]
        AY1=At[Yt==1 & is.na(At)==F]
        hh=histogram::histogram(AY1,verbose=F,type="irregular")$breaks
        if(hh[length(hh)]<max(At,na.rm=T)){hh[length(hh)]=max(At,na.rm=T)+0.1}
        if(hh[1] > min(At[At>0],na.rm=T)){hh[1]=min(At[At>0],na.rm=T)-0.1}
        Atnew=cut(At,breaks=hh)
        Avnew=cut(Av,breaks=hh)
        if(length(na.omit(unique(Atnew)))<=1 | length(na.omit(unique(Avnew)))<=1) {
          thetaV=c(thetaV,NA)
          varICV = c(varICV,NA)
          labV=rbind(labV,c(NA,NA))
          EY0V=c(EY0V,NA)
          EY1V=c(EY1V,NA)
          nV=c(nV,NA)
        }
        if(length(na.omit(unique(Atnew)))>1 & length(na.omit(unique(Avnew))) > 1) {
          labs=names(table(Atnew))
          Atnew=as.numeric(Atnew)-1
          Avnew=as.numeric(Avnew)-1
          numcat.cont[i]=length(labs)
          #     change this to match what was done for factors - once 
          #      cats.cont[[i]]=as.numeric(na.omit(unique(Atnew)))
          cats.cont[[i]]=as.numeric(names(table(Atnew)))
          ### data.numW is just the same as data.num except with NA's replaced by 0's.
          if(is.null(miss.cont)){miss.cont=rep(NA,n.cont)}
          if(is.null(miss.fac)){miss.fac=rep(NA,n.cont)}
          if(is.null(datafac.dumW)){datafac.dumW=rep(NA,n.cont)}
          if(is.null(data.numW)){data.numW=rep(NA,n.cont)}
          W=data.frame(data.numW[,-i,drop=F],miss.cont,datafac.dumW,miss.fac)
          W=W[, !apply(is.na(W), 2, all),drop=F]
          Wt=W[folds!=kk,,drop=F]
          Wv=W[folds==kk,,drop=F]
          
          nmesW = names(Wt)
          mtch=match(nmesW,paste("Imiss_",nameA,sep=""))
          Wt=Wt[,is.na(mtch),drop=F]
          Wv=Wv[,is.na(mtch),drop=F]
          ### Drop any adjustment variables overly correlated with At (keep squared corr < corthres)
          corAt=apply(Wt,2,cor.two,y=At)
#          cat("i = ",i," maxCor = ",max(corAt,na.rm=T),"\n")
          incc= corAt < corthres & is.na(corAt)==F
          Wv=Wv[,incc,drop=F]
          Wt=Wt[,incc,drop=F]
          #### Use HOPACH to reduce dimension of W to some level of tree
          nw=dim(Wv)[2]
          ### Use HOPACH below only if the number of covariates exceeds 10
          if(nw <=10) {
            Wtsht=Wt
            Wvsht=Wv
          }
          if(nw > 10) {
          mydist<-as.matrix(distancematrix(t(Wt),d="cosangle",na.rm=T))
          hopach.1<-try(hopach(t(Wt),dmat=mydist,mss="mean",verbose=FALSE),silent=TRUE)
          if(class(hopach.1)=="try-error"){
          hopach.1<-try(hopach(t(Wt),dmat=mydist,mss="med",verbose=FALSE),silent=TRUE)
          }
          if(class(hopach.1)=="try-error"){
            Wtsht=Wt
            Wvsht=Wv
          }
          if(class(hopach.1) != "try-error") {
            nlvls=nchar(max(hopach.1$final$labels))
          no<-trunc(mean(log10(hopach.1$final$labels)))
          ### Find the highest level of the tree where the number of clusters is >= ncov
          lvl = 1:nlvls
          ncv=NULL
          for(ii in lvl) {ncv=c(ncv,length(unique(trunc(hopach.1$final$labels/10^(no-(ii-1))))))}
          ncv=unique(ncv)
          lev = min(min(nlvls,dim(Wt)[2]),min(lvl[ncv>=ncov]))
          two.clust<-unique(trunc(hopach.1$final$labels/(10^(no-(lev-1)))))
          md<-hopach.1$final$medoids
          mm=md[,1]%in%two.clust
          incc=md[mm,2]
          Wtsht=Wt[,incc]
          Wvsht=Wv[,incc]
          }
          }
          deltat=as.numeric(is.na(Yt)==F & is.na(Atnew)==F)
          deltav=as.numeric(is.na(Yv)==F & is.na(Avnew)==F)
          ## Drop training obs missing A or Y; note the subsetting must apply to
          ## Atnew (the pooled version used below), not the raw At
          if(sum(deltat==0)<10){
            Yt=Yt[deltat==1]
            Atnew=Atnew[deltat==1]
            Wtsht=Wtsht[deltat==1,,drop=F]
            deltat=deltat[deltat==1]
          }
          vals=cats.cont[[i]]
          maxEY1=-100000
          minEY1=1000000
          minj=0
          maxj=0
          ICmin=NULL
          ICmax=NULL
          Atnew[is.na(Atnew)]=-1
          Avnew[is.na(Avnew)]=-1
          if(min(table(Avnew[Avnew >= 0],Yv[Avnew >= 0])) <= minCell) {
            thetaV=c(thetaV,NA)
            varICV = c(varICV,NA)
            labV=rbind(labV,c(NA,NA))
            EY0V=c(EY0V,NA)
            EY1V=c(EY1V,NA)
            nV=c(nV,NA)
          }
          if(min(table(Avnew[Avnew >= 0],Yv[Avnew >= 0])) > minCell) {
            labmin=NULL
            labmax=NULL
            for(j in 1:numcat.cont[i]) {
#              cat(" i = ",i," kk = ",kk," j = ",j,"\n")
              IA = as.numeric(Atnew==vals[j])
              res = get.tmle.est(Yt,IA,Wtsht,deltat,Q.lib=Q.library,g.lib=g.library)
              EY1=res$theta
              if(EY1 < minEY1){
                minj = j
                minEY1=EY1
                labmin=labs[j]}
              if(EY1 > maxEY1){
                maxj = j
                maxEY1=EY1
                labmax=labs[j]
              }    
            }  
            ##### Now, estimate on validation sample
            IA = as.numeric(Avnew==vals[minj])
            res = get.tmle.est(Yv,IA,Wvsht,deltav,Q.lib=Q.library,g.lib=g.library)
            IC0=res$IC
            EY0=res$theta
            IA = as.numeric(Avnew==vals[maxj])
            res = get.tmle.est(Yv,IA,Wvsht,deltav,Q.lib=Q.library,g.lib=g.library)
            IC1=res$IC
            EY1=res$theta
            thetaV=c(thetaV,EY1-EY0)
            varICV = c(varICV,var(IC1-IC0))
            labV=rbind(labV,c(labmin,labmax))
            EY0V=c(EY0V,EY0)
            EY1V=c(EY1V,EY1)
            nV=c(nV,length(Yv)) 
          }
        }
      }
      #print(list(EY1V,EY0V,thetaV,varICV,labV,nV))
      list(EY1V,EY0V,thetaV,varICV,labV,nV,"numeric")
    }
    )	
    vars.cont=colnames(data.cont.dist)
    out.cont=out.put
    stf=c("vars.cont","out.cont")
    #allt=ls()
    #mm=allt%in%stf
    #allt=allt[mm==F]
    #rm(list=allt)
    vars=colnames(data.fac)
    nc=length(vars.cont)
    nf=length(vars)
    factr=c(rep("ordered",nc),rep("factor",nf))
    vars=c(vars.cont,vars)
    out.put=c(out.cont,output)
    names(out.put)=vars
    lngth=sapply(out.put,function(x) length(x))
    ## each element is a 7-item list (the 7th element tags the variable type)
    out.put=out.put[lngth==7]
    vars=vars[lngth==7]
    factr=factr[lngth==7]
    ## Get rid of any variables that have a validation sample
    ## with no estimates of variable importance
    lngth2=sapply(out.put,function(x) length(na.omit(x[[1]])))
    out.put=out.put[lngth2==V]
    vars=vars[lngth2==V]
    factr=factr[lngth2==V]
    tst=lapply(out.put, function(x) x[[3]])
    tst=do.call(rbind,tst)
    tot.na=function(x){sum(is.na(x))}
    xx=apply(tst,1,tot.na)
    out.sht=out.put[xx==0]
    vars=vars[xx==0]
    factr=factr[xx==0]
    
    #names(out.sht)=vars[xx==0]
    tst=lapply(out.sht, function(x) x[[1]])
    EY1=do.call(rbind,tst)
    tst=lapply(out.sht, function(x) x[[2]])
    EY0=do.call(rbind,tst)
    tst=lapply(out.sht, function(x) x[[3]])
    theta=do.call(rbind,tst)
    tst=lapply(out.sht, function(x) x[[4]])
    varIC=do.call(rbind,tst)
    tst=lapply(out.sht, function(x) x[[6]])
    nV=do.call(rbind,tst)
    n=sum(nV[1,])
    SEV = sqrt(varIC/nV)
    ##### Get the (low, high) comparison labels chosen within each fold
    labs.get=function(x,fold) {
      ## x is a fold-by-2 matrix of (low, high) labels; flatten it row-wise and
      ## name the entries v.<k>a_L / v.<k>a_H for fold k (works for any fold)
      lbel=rep(1:fold,each=2)
      out=as.vector(t(x))
      names(out)=paste("v.",lbel,rep(c("a_L","a_H"),fold),sep="")
      out}
    tst=lapply(out.sht, function(x) x[[5]])
    tst=lapply(tst,labs.get,fold=V)
    lbs=do.call(rbind,tst)
    
    
    meanvarIC=apply(varIC,1,mean)
    psi=apply(theta,1,mean)
    SE=sqrt(meanvarIC/n)
    lower=psi-1.96*SE
    upper=psi+1.96*SE
    signdig=3
    CI95=paste("(",signif(lower,signdig)," - ",signif(upper,signdig),")",sep="")
    # 1-sided p-value
    pvalue=1-pnorm(psi/SE)
    #####  FOR THETA (generalize to chi-square test?)
    TT = (theta[,1]-theta[,2])/sqrt(SEV[,1]^2+SEV[,2]^2)
    pval.comp=2*(1-pnorm(abs(TT)))
    #####  FOR levels (just make sure in same order)
    nc=sum(factr=="ordered")
    n=length(factr)
    ### Ordered variables first
     length.uniq=function(x){length(unique(x))}
     cons=NULL
     if(nc > 0) {
     dir=NULL
     for(i in 1:V) {
      ltemp=lbs[1:nc,i*2-1]
      xx=regexpr(",",ltemp)
      lwr=as.numeric(substr(ltemp,2,xx-1))
      utemp=lbs[1:nc,i*2]
      xx=regexpr(",",utemp)
      nx=nchar(utemp)
      uwr=as.numeric(substr(utemp,xx+1,nx-1))
      dir=cbind(dir,uwr>lwr)
    }
    cons=apply(dir,1,length.uniq)
     }
    #### Factors
    if(n-nc > 0) {
    lwr=NULL
    uwr=NULL
    for(i in 1:V) {
      lwr=cbind(lwr,lbs[(nc+1):n,i*2-1])
      uwr=cbind(uwr,lbs[(nc+1):n,i*2])
    }
    conslwr=apply(lwr,1,length.uniq)
    consupr=apply(uwr,1,length.uniq)
    cons=c(cons,conslwr*consupr)
    }
    consist= (cons==1 & pval.comp > 0.05)
        
    procs<-c("Holm","BH")
    if(n > 1) {
    res<-mt.rawp2adjp(pvalue,procs)
    oo<-res$index
    outres=data.frame(factor=factr[oo],theta[oo,],psi[oo],
                      CI95[oo],res$adj,lbs[oo,],consist[oo])
    }
    if(n==1) {
      outres=data.frame(factor=factr,theta,psi,
                        CI95,rawp=pvalue,Holm=pvalue,BH=pvalue,lbs,consist)
    }
    outres=outres[is.na(outres[,"rawp"]) ==F,]
    names(outres)[1:5]=c("VarType","psiV1","psiV2","AvePsi","CI95")
    names(outres)[13]="Consistent"
    
    ### Get Consistency Measure and only significant
    outres.cons=outres[outres[,"Consistent"],-13]
    outres.cons=outres.cons[outres.cons[,"BH"]<0.05,]
    
    
    #drops = c("VarType","description","Holm,")
    #outres.all=outres[,!(names(outres) %in% drops)]
    outres.byV=outres[,c(2,3,9:12)]
    print(xtable(outres.byV,caption="data Variable Importance Results By Estimation Sample",label="byV",digits=4),type="latex",file=paste(dirout,"byV.tex",sep=""), caption.placement="top",include.rownames=T)
    
    outres.all=outres[,c(4,5,6,8)]
    print(xtable(outres.all, caption="data Variable Importance Results for Combined Estimates",label="allRes",digits=4),type="latex",file=paste(dirout,"AllReslts.tex",sep=""),caption.placement="top",include.rownames=T)
    
    
    print(xtable(outres.cons[,c(4,5)],caption="Subset of Significant and ``Consistent'' Results",label="consisRes",digits=4),type="latex",file=paste(dirout,"ConsistReslts.tex",sep=""),caption.placement="top",include.rownames=T)
    
  }

###############################################################
###############################################################
#########################  If only have factors for predictors
###############################################################
if(n.fac > 0 & n.num==0) { 
  ## For each factor, apply qq.range function and get rid of those where "true"
  #### data.fac is data frame of variables that are factors
  ##############
  xp=dim(data.fac)[2]
  facnames = names(data.fac)
  nam.fac = function(x,name) {
    nc=nchar(x)
    out = paste(name,substr(x,2,nc),sep="XX")
    out = gsub(" ","",out)  
    return(out)
  }
  newX=NULL
  coln=NULL
  options(na.action='na.pass')
  for(i in 1:xp) {
#    cat(" i = ",i,"\n")
    x=data.fac[,i]
    inds <- model.matrix(~ x - 1)[,-1]
    nmes=colnames(inds)
    if(is.null(nmes)) {nmes2=facnames[i]}
    if(!is.null(nmes)){nmes2=nam.fac(nmes,facnames[i])}
    coln=c(coln,nmes2)
    newX=cbind(newX,inds)
  }
  colnames(newX)=coln
  ## Indexing vector for dummy basis back to original factors
  cc = regexpr("XX",coln)
  ncc=nchar(coln)
  cc[cc<0]=ncc[cc<0]+1
  fac.indx=substr(coln,1,cc-1)
  datafac.dum=newX
  ############  Missing Basis for Factors
  xp=dim(datafac.dum)[2]
  sna=apply(datafac.dum,2,sum.na)
  nmesX=colnames(datafac.dum)
  miss.fac=NULL
  nmesm=NULL
  for(k in 1:xp){
    if(sna[k] > 0) {
      ix=as.numeric(is.na(datafac.dum[,k])==F)
      datafac.dum[is.na(datafac.dum[,k]),k]=0
      miss.fac=cbind(miss.fac,ix)
      nmesm=c(nmesm,paste("Imiss_",nmesX[k],sep=""))
    }
  } 
  colnames(miss.fac)=nmesm
  
  ## Find the level of covariate that has lowest risk
  datafac.dumW=datafac.dum
  datafac.dumW[is.na(datafac.dum)]=0
  n=length(Y)
  get.tmle.est=function(Y,A,W,delta=NULL,Q.lib,g.lib) {
    ## Because of a quirk in the tmle program, delete observations
    ## with delta=0 if their count is > 0 and < 10
    n=length(Y)
    inc=rep(TRUE,n)
    if(is.null(delta)==F) {
      ss=sum(delta==0)
      if(ss > 0 & ss < 10) {
        inc[delta==0]=FALSE  
      }
    }
    Y=Y[inc]
    A=A[inc]
    W=W[inc,,drop=F]
    delta=delta[inc]
    tmle.1=tmle(Y, A,W,Delta=delta,  g.SL.library=g.lib,Q.SL.library =Q.lib, family = fam, verbose = FALSE)
    g1=tmle.1$g$g1W
    Qst=tmle.1$Qstar[,2]
    theta=mean(Qst)
    IC=(A/g1)*(Y-Qst)+Qst-theta
    return(list(theta=theta,IC=IC)) 
  }
  
  #### Stratified CV to ensure balance (by one grouping variable, Y)
  CC.CV=function(V,Y) {
    nn=length(Y)
    tt=table(Y)
    Ys=sort(unique(Y))  ## sort so Ys aligns with the ordering of table(Y)
    nys=length(Ys)
    out=rep(NA,nn)
    for(i in 1:nys) {
      n=as.numeric(tt[i])
      xx=cvFolds(n, K=V, R = 1,type = "random")$which
      out[Y==Ys[i]]=xx
    }
    return(out)
  }
  folds=CC.CV(V,Y)
  max.2=function(x) {max(x^2,na.rm=T)}
  
  #detach("package:cvTools", unload=TRUE)
  #detach("package:lattice", unload=TRUE)
  #library(doParallel)
  #cl <- makeCluster(10)
  #registerDoParallel(cl)
  ### Below is to get indexing vectors so that
  ### any basis functions related to current A
  ### that are in covariate matrix can be removed
  names.fac=colnames(data.fac)
  nmes.facW=colnames(datafac.dumW)
  nmes.mfacW=colnames(miss.fac)
  nchar.facW=nchar(nmes.facW)+1
  nchar.mfacW=nchar(nmes.mfacW)+1
  XXm=regexpr("XX",nmes.facW)
  XXm[XXm<0]=nchar.facW[XXm<0]
  XXm2=regexpr("XX",nmes.mfacW)
  XXm2[XXm2<0]=nchar.mfacW[XXm2<0]
  vars.facW=substr(nmes.facW,1,XXm-1)
  vars.mfacW=substr(nmes.mfacW,7,XXm2-1)
  xc=dim(data.fac)[2]
  n.fac=dim(data.fac)[1]
  ###############################################################################
  ###### VIM for Factors
  ###############################################################################
  ###############################################################################
  output <- lapply(1:xc, function(i) {
    nameA=names.fac[i]
    cat(" i = ",i,"Var = ",nameA, " out of ",xc," factor variables","\n")
    thetaV=NULL
    varICV = NULL
    labV=NULL
    EY0V=NULL
    EY1V=NULL
    nV=NULL
    for(kk in 1:V) {
#      cat(" i = ",i," V = ",kk,"\n")
      At=data.fac[folds!=kk,i]
      Av=data.fac[folds==kk,i]
      Yt=Y[folds != kk]
      Yv=Y[folds == kk]
      ### Remove from W any dummy/missingness basis functions derived from the current A
      mtch1=match(vars.facW,nameA)
      mtch2=match(vars.mfacW,nameA)
      Adum=data.frame(datafac.dum[,is.na(mtch1)==F])
      dumW=datafac.dum[,is.na(mtch1)]
      missdumW=miss.fac[,is.na(mtch2)]
      if(is.null(missdumW)){missdumW=rep(NA,n.fac)}
      if(is.null(dumW)){dumW=rep(NA,n.fac)}
      W=data.frame(dumW,missdumW)
      W=W[, !apply(is.na(W), 2, all),drop=F]
      Wt=W[folds!=kk,,drop=F]
      Wv=W[folds==kk,,drop=F]
      Adum=data.frame(Adum[folds !=kk,])
      ### Drop any adjustment variables overly correlated with At (keep squared corr < corthres)
      corAt=apply(cor(Adum,Wt,use="complete.obs"),2,max.2)
#      cat("i = ",i," maxCor = ",max(corAt,na.rm=T),"\n")
      incc= abs(corAt) < corthres & is.na(corAt)==F
      Wv=Wv[,incc,drop=F]
      Wt=Wt[,incc,drop=F]
      nw=dim(Wv)[2]
      ### Use HOPACH below only if the number of covariates exceeds 10
if(nw <=10) {
  Wtsht=Wt
  Wvsht=Wv
}
if(nw > 10) {
  mydist<-as.matrix(distancematrix(t(Wt),d="cosangle",na.rm=T))
  hopach.1<-try(hopach(t(Wt),dmat=mydist,mss="mean",verbose=FALSE),silent=TRUE)
  if(class(hopach.1)=="try-error"){
    hopach.1<-try(hopach(t(Wt),dmat=mydist,mss="med",verbose=FALSE),silent=TRUE)
  }
  if(class(hopach.1)=="try-error"){
    Wtsht=Wt
    Wvsht=Wv
  }
  if(class(hopach.1) != "try-error") {
    nlvls=nchar(max(hopach.1$final$labels))
    no<-trunc(mean(log10(hopach.1$final$labels)))
    ### Find the highest level of the tree where the number of clusters is >= ncov
    lvl = 1:nlvls
    ncv=NULL
    for(ii in lvl) {ncv=c(ncv,length(unique(trunc(hopach.1$final$labels/10^(no-(ii-1))))))}
    ncv=unique(ncv)
    lev = min(min(nlvls,dim(Wt)[2]),min(lvl[ncv>=ncov]))
    two.clust<-unique(trunc(hopach.1$final$labels/(10^(no-(lev-1)))))
    md<-hopach.1$final$medoids
    mm=md[,1]%in%two.clust
    incc=md[mm,2]
    Wtsht=Wt[,incc]
    Wvsht=Wv[,incc]
  }
}

      deltat=as.numeric(is.na(Yt)==F & is.na(At)==F)
      deltav=as.numeric(is.na(Yv)==F & is.na(Av)==F)
      levA=levels(At)
      minc=apply(table(Av,Yv),1,min)
      minc2=apply(table(At,Yt),1,min)
      vals=levA[pmin(minc,minc2)>minCell]
      num.cat=length(vals)    
      nYt=sum(Yt[is.na(At)==FALSE])
      nYv=sum(Yv[is.na(Av)==F])
      ## Skip if (1) no more than one category of A is left, or (2) the
      ## missingness pattern for A leaves too few outcome events
      ## in either sample (< minYs)
      if(num.cat < 2 | min(nYt,nYv)<minYs) {
        thetaV=c(thetaV,NA)
        varICV = c(varICV,NA)
        labV=rbind(labV,c(NA,NA))
        EY0V=c(EY0V,NA)
        EY1V=c(EY1V,NA)
        nV=c(nV,NA)
      }
      if(num.cat >= 2 & min(nYt,nYv) >= minYs) {  
        labmin=NULL
        labmax=NULL
        maxEY1=-100000
        minEY1=1000000
        minj=0
        maxj=0
        ICmin=NULL
        ICmax=NULL 
        for(j in 1:num.cat) {
          cat(" j = ",j,"\n")
          IA = as.numeric(At==vals[j])
          IA[is.na(IA)]=0
          res = get.tmle.est(Yt,IA,Wtsht,deltat,Q.lib=Q.library,g.lib=g.library)
          EY1=res$theta
          if(EY1 < minEY1){
            minj = j
            minEY1=EY1
            labmin=vals[j]}
          if(EY1 > maxEY1){
            maxj = j
            maxEY1=EY1
            labmax=vals[j]
          }    
        }  
        ##### Now, estimate on validation sample
        IA = as.numeric(Av==vals[minj])
        IA[is.na(IA)]=0
        res = get.tmle.est(Yv,IA,Wvsht,deltav,Q.lib=Q.library,g.lib=g.library)
        IC0=res$IC
        EY0=res$theta
        IA = as.numeric(Av==vals[maxj])
        IA[is.na(IA)]=0
        res = get.tmle.est(Yv,IA,Wvsht,deltav,Q.lib=Q.library,g.lib=g.library)
        IC1=res$IC
        EY1=res$theta
        thetaV=c(thetaV,EY1-EY0)
        varICV = c(varICV,var(IC1-IC0))
        labV=rbind(labV,c(labmin,labmax))
        EY0V=c(EY0V,EY0)
        EY1V=c(EY1V,EY1)
        nV=c(nV,length(Yv)) 
      }
    }
    list(EY1V,EY0V,thetaV,varICV,labV,nV)
    #print(data.frame(EY1V,EY0V,thetaV,varICV,labV,nV))
  }
  )
  ###############################################################################
  #################################  Repeat for Continuous Explanatory Variables
  ###############################################################################
  
  vars=colnames(data.fac)
#  nc=length(vars.cont)
  nf=length(vars)
  factr=rep("factor",nf)
  out.put=output
  names(out.put)=vars
  lngth=sapply(out.put,function(x) length(x))
  out.put=out.put[lngth==6]
  vars=vars[lngth==6]
  factr=factr[lngth==6]
  lngth2=sapply(out.put,function(x) length(na.omit(x[[1]])))
  out.put=out.put[lngth2==V]
  vars=vars[lngth2==V]
  factr=factr[lngth2==V]

  tst=lapply(out.put, function(x) x[[3]])
  tst=do.call(rbind,tst)
  tot.na=function(x){sum(is.na(x))}
  xx=apply(tst,1,tot.na)
  out.sht=out.put[xx==0]
  vars=vars[xx==0]
  factr=factr[xx==0]
  
  #names(out.sht)=vars[xx==0]
  tst=lapply(out.sht, function(x) x[[1]])
  EY1=do.call(rbind,tst)
  tst=lapply(out.sht, function(x) x[[2]])
  EY0=do.call(rbind,tst)
  tst=lapply(out.sht, function(x) x[[3]])
  theta=do.call(rbind,tst)
  tst=lapply(out.sht, function(x) x[[4]])
  varIC=do.call(rbind,tst)
  tst=lapply(out.sht, function(x) x[[6]])
  nV=do.call(rbind,tst)
  n=sum(nV[1,])
  SEV = sqrt(varIC/nV)
  ##### Get the (low, high) comparison labels chosen within each fold
  labs.get=function(x,fold) {
    ## x is a fold-by-2 matrix of (low, high) labels; flatten it row-wise and
    ## name the entries v.<k>a_L / v.<k>a_H for fold k (works for any fold)
    lbel=rep(1:fold,each=2)
    out=as.vector(t(x))
    names(out)=paste("v.",lbel,rep(c("a_L","a_H"),fold),sep="")
    out}
  tst=lapply(out.sht, function(x) x[[5]])
  tst=lapply(tst,labs.get,fold=V)
  lbs=do.call(rbind,tst)
  
  
  meanvarIC=apply(varIC,1,mean)
  psi=apply(theta,1,mean)
  SE=sqrt(meanvarIC/n)
  lower=psi-1.96*SE
  upper=psi+1.96*SE
  signdig=3
  CI95=paste("(",signif(lower,signdig)," - ",signif(upper,signdig),")",sep="")
  # 1-sided p-value
  pvalue=1-pnorm(psi/SE)
  #####  FOR THETA (generalize to chi-square test?)
  TT = (theta[,1]-theta[,2])/sqrt(SEV[,1]^2+SEV[,2]^2)
  pval.comp=2*(1-pnorm(abs(TT)))
  #####  FOR levels (just make sure in same order)
  nc=sum(factr=="ordered")
  n=length(factr)
  #### Factors
  lwr=NULL
  uwr=NULL
  for(i in 1:V) {
    lwr=cbind(lwr,lbs[(nc+1):n,i*2-1])
    uwr=cbind(uwr,lbs[(nc+1):n,i*2])
  }
  length.uniq=function(x){length(unique(x))}
  conslwr=apply(lwr,1,length.uniq)
  consupr=apply(uwr,1,length.uniq)
  cons=conslwr*consupr
  consist= (cons==1 & pval.comp > 0.05)
  
  
  procs<-c("Holm","BH")
  res<-mt.rawp2adjp(pvalue,procs)
  oo<-res$index
  outres=data.frame(factor=factr[oo],theta[oo,],psi[oo],
                    CI95[oo],res$adj,lbs[oo,],consist[oo])
  
  outres=outres[is.na(outres[,"rawp"]) ==F,]
  names(outres)[1:5]=c("VarType","psiV1","psiV2","AvePsi","CI95")
  names(outres)[13]="Consistent"
  
  ### Get Consistency Measure and only significant
  outres.cons=outres[outres[,"Consistent"],-13]
  outres.cons=outres.cons[outres.cons[,"BH"]<0.05,]
  
  
  #drops = c("VarType","description","Holm,")
  #outres.all=outres[,!(names(outres) %in% drops)]
  outres.byV=outres[,c(2,3,9:12)]
  print(xtable(outres.byV,caption="data Variable Importance Results By Estimation Sample",label="byV",digits=4),type="latex",file=paste(dirout,"byV.tex",sep=""), caption.placement="top",include.rownames=T)
  
  outres.all=outres[,c(4,5,6,8)]
  print(xtable(outres.all, caption="data Variable Importance Results for Combined Estimates",label="allRes",digits=4),type="latex",file=paste(dirout,"AllReslts.tex",sep=""),caption.placement="top",include.rownames=T)
  
  
  print(xtable(outres.cons[,c(4,5)],caption="Subset of Significant and ``Consistent'' Results",label="consisRes",digits=4),type="latex",file=paste(dirout,"ConsistReslts.tex",sep=""),caption.placement="top",include.rownames=T)
}
}