R/glm_approx.R

Defines functions glm.approx compute.glm compute.lm glm.coef compute.dispersion wls.mb wls.coef compute.approx.z add.counts extract.sf bintest safe.quasibinomial.glm.fit vss vs v3

Documented in bintest glm.approx safe.quasibinomial.glm.fit v3 vs vss

#' Compute approximately unbiased variance estimates for the estimators for logit(p) when n is small


v3=function(n,s,f){
    return((n+1)/n*(1/(s+1)+1/(f+1)))
}

#' Compute approximately unbiased variance estimates for the estimators for logit(p) when n is small


vs=function(n,s,f){
    vv=v3(n,s,f)
    return(vv*(1-2/n+vv/2))
}

#' Compute approximately unbiased variance estimates for the estimators for logit(p) when n is small


vss=function(n,s,f){
    vv=v3(n,s,f)
    return(vs(n,s,f)-1/2*vv^2*(vv-4/n))
}


#' Modified glm function to return relevant outputs, not allowing for underdispersion


safe.quasibinomial.glm.fit=function(x,y,forcebin=FALSE,repara=FALSE,...){
    if(forcebin){
        z=glm.fit(x,y,family=binomial(),...)
        p1=1L:z$rank
        covmat=chol2inv(z$qr[[1]][p1, p1, drop = FALSE])
        se=sqrt(diag(covmat))
        if(repara==TRUE){
            if(length(covmat)<=1){
                covmubeta=NA
            }else{
                covmubeta=covmat[2,1]
            }
            mbvar=covmubeta/se[2]^2
            z$coef[1]=z$coef[1]-z$coef[2]*mbvar
            se[1]=sqrt(se[1]^2+mbvar^2*se[2]^2-2*mbvar*covmubeta)
        }
    }else{
        z = glm.fit(x,y,family=quasibinomial(),...)
        p1=1L:z$rank
        covmat.i=chol2inv(z$qr[[1]][p1, p1, drop = FALSE])
        df=z$df.residual
        if(df==0){
            d=1
        }else{
            d=sum((z$weights * z$residuals^2)[z$weights > 0])/df
            d=d*(d>=1)+1*(d<1)
        }
        covmat=covmat.i*d
        se=sqrt(diag(covmat))
        if(repara==TRUE){
            if(length(covmat)<=1){
                covmubeta=NA
            }else{
                covmubeta=covmat[2,1]
            }
            mbvar=covmubeta/se[2]^2
            z$coef[1]=z$coef[1]-z$coef[2]*mbvar
            se[1]=sqrt(se[1]^2+mbvar^2*se[2]^2-2*mbvar*covmubeta)
        }
    }
    if(repara==FALSE){
        return(c(z$coef[1],se[1],z$coef[2],se[2]))
    }else{
        return(c(z$coef[1],se[1],z$coef[2],se[2],mbvar)) 
    }
}



#' Returns estimates of intercept and slope as well as their SEs, given other input options. Called in glm.approx


bintest = function(x,g,minobs=1,pseudocounts=0.5,all=FALSE,forcebin=FALSE,repara=FALSE){
    xmat = matrix(x,ncol=2)
    zerosum = (apply(xmat,1,sum)==0)
    if(sum(!zerosum)>(minobs-1)){   #check for enough observations
        ind1=(xmat[,1]==0)
        ind2=(xmat[,2]==0)
        if(all==TRUE){
            xmat = xmat[!zerosum,,drop=F]+pseudocounts
        }else{
            xmat[ind1,1]=xmat[ind1,1]+pseudocounts
            xmat[ind1,2]=xmat[ind1,2]+pseudocounts
            xmat[ind2,1]=xmat[ind2,1]+pseudocounts
            xmat[ind2,2]=xmat[ind2,2]+pseudocounts
            xmat = xmat[!zerosum,,drop=F]
        }
        #check if there is enough variance in g among informative individuals
        g=g[!zerosum]
        ng=sum(!zerosum)
        dm=matrix(c(rep(1,ng),g),ncol=2)
        if(!is.na(sd(g))&(sd(g) > 0.1)){
            dm[,2]=g
            return(safe.quasibinomial.glm.fit(dm,xmat,forcebin,repara=repara))
        }else {
            if(repara==FALSE){
                return(c(safe.quasibinomial.glm.fit(dm,xmat,forcebin,repara=repara)[1:2],NA,NA))
            }else{
                return(c(safe.quasibinomial.glm.fit(dm,xmat,forcebin,repara=repara)[1:2],NA,NA,NA))
            }
        }     
    }else{ #not enough observations, so just return NAs
        if(repara==FALSE){
            return(c(NA,NA,NA,NA))
        }else{
            return(c(NA,NA,NA,NA,NA))
        }
    }
}


extract.sf=function(x,n){
    return(list(x.s=as.vector(t(x[,(1:(2*n))%%2==1])),x.f=as.vector(t(x[,(1:(2*n))%%2==0]))))
}

add.counts=function(x.s,x.f,eps,pseudocounts,all,index1,index2,indexn=NULL){
    if(pseudocounts==0){                    
        x.s[index1]=x.s[index1]+eps
        x.f[index2]=x.f[index2]+eps
    }else if(pseudocounts!=0&all==TRUE){
        x.s=x.s+pseudocounts
        x.f=x.f+pseudocounts
    }else{
        x.s[index1]=x.s[index1]+pseudocounts
        x.f[index1]=x.f[index1]+pseudocounts
        x.s[index2]=x.s[index2]+pseudocounts
        x.f[index2]=x.f[index2]+pseudocounts
    }
    if(!is.null(indexn)){
        x.s[indexn]=0
        x.f[indexn]=0 
    }
    return(list(x.s=x.s,x.f=x.f))
}

compute.approx.z=function(x.s,x.f,bound,eps,pseudocounts,all,indexn=NULL,return.p=FALSE){
    #compute mu
    index1=(x.s/x.f)<=bound                  #find indices for which binomial success or failures are too small
    index2=(x.f/x.s)<=bound
    index1[is.na(index1)]=FALSE
    index2[is.na(index2)]=FALSE              #this is the same as above!!!
    x=add.counts(x.s,x.f,eps,pseudocounts,all,index1,index2,indexn)       #add pseudocounts
    s=x$x.s+x$x.f 
    mu=log(x$x.s/x$x.f)                          #compute logit(p) to be used as observations
    mu[index1]=mu[index1]-0.5                 #end-point correction
    mu[index2]=mu[index2]+0.5
    #compute var
    if(all==FALSE){                         #compute var(logit(p))
        var=vss(s,x$x.s,x$x.f)
        var[index1]=vss(s[index1]-2*pseudocounts,x$x.s[index1]-pseudocounts,x$x.f[index1]-pseudocounts)
        var[index2]=vss(s[index2]-2*pseudocounts,x$x.s[index2]-pseudocounts,x$x.f[index2]-pseudocounts)
    }else{
        var=vss(s-2*pseudocounts,x$x.s-pseudocounts,x$x.f-pseudocounts)   
    }
    if(return.p==TRUE)
        return(list(mu=mu,var=var,p=x$x.s/s))
    else
        return(list(mu=mu,var=var)) 
}

wls.coef=function(z,disp,indexnm,n,ng,forcebin,g=NULL,repara=NULL){
    #compute vector of dfs for all n linear models (disregarding obs with missing data)
    if(is.null(g))
        df=pmax(colSums(!indexnm)-1,0)
    else
        df=pmax(colSums(!indexnm)-2,0)
    zm=matrix(z$mu,ncol=n,byrow=T)             #create ng*n matrix of logit(p)
    zm[indexnm]=0
    vm=matrix(z$var,ncol=n,byrow=T)             #create ng*n matrix of var(logit(p))
    res=wls.mb(zm,vm,disp,indexnm,ng,df,forcebin,g,n)
    if(disp=="add"){                        #return estimates if multiplicative dispersion is assumed
        vm[indexnm]=NA
        vv=pmax((res$wrse2-1)*colMeans(vm,na.rm=T),0)   #computes crude estimate of sigma_u^2 as in documentation
        res=wls.mb(zm,vm,disp,indexnm,ng,df,forcebin,g,n,vv)
    }
    if(is.null(g))
        return(list(coef=res$coef,se=res$se,mbvar=NULL))
    else{
        coef=c(res$muhat,res$betahat)
        se=c(res$semuhat,res$sebetahat)
        if(repara==TRUE){                            #return reparametrized muhat and behat as well as their SEs, together with gamma as defined in documentation
            mbvar=res$covmubeta/res$sebetahat^2
            coef[1:n]=res$muhat-res$betahat*mbvar
            se[1:n]=sqrt(res$semuhat^2+mbvar^2*res$sebetahat^2-2*mbvar*res$covmubeta)
        }else{
            if(repara==FALSE)
                mbvar=NULL
            else
                stop("Error: invalid argument 'repara'")
        }
        return(list(coef=coef,se=se,mbvar=mbvar))  
    }
}

wls.mb=function(z,v,disp,indexnm,ng,df,forcebin,g=NULL,n=NULL,vv=NULL){
    if(is.null(vv)){                       #compute weights for each of the n models
        w=1/v                                
    }else{
        w=1/(v+rep(1,ng)%o%vv) 
    }
    w[indexnm]=0
    ws=colSums(w)                         #define sum of weights for each of the n models (to be used later)
    if (is.null(g)){
        muhat=colSums(w*z)/ws                 #compute muhat for each of the n models using formula in documentation
        wrse=sqrt(colSums((z-rep(1,ng)%o%muhat)^2*w)/df)  #compute residual standard error
    }else{
	gwmean=colSums(w*g)/ws                #define weighted center of g for each of the n models (to be used later)
	ggwmeanm=g%o%rep(1,n)-rep(1,ng)%o%gwmean                    #define weighted difference between each g and its weighted center for each of the n models (to be used later)
        wgg=colSums(w*ggwmeanm^2)                     #compute sum_j w_j^2*(g_j-gwmean)^2 (to be used later)
        wgg.ind=wgg<1e-6                               
        wgg[wgg.ind]=0
        betahat=colSums(w*z*ggwmeanm)/colSums(w*ggwmeanm^2)         #compute betahat using formula in documentation
	g.betahat=g%o%betahat
                                        #compute betahat*g  for each of the n models
        muhat=colSums(w*(z-g.betahat))/ws           #compute muhat using formula in documentation
        wrse=sqrt(colSums((z-rep(1,ng)%o%muhat-g.betahat)^2*w)/df)   #compute residual standard error
    }
    wrse[is.na(wrse)]=1 
    if(forcebin|(is.null(g)&ng==2)|(!is.null(g)&ng==length(unique(g))))  #force dispersion to be absent (also in the case with only 1 observation in each group)
        wrse=1
    else{
        if(is.null(vv)){
            wrse[(wrse==Inf)|(wrse<1)]=1          #do not allow for "underdispersion"
        }else{
            wrse[(wrse==Inf)|(vv==0)]=1
        }
    }
    wrse2=wrse^2
    if(is.null(g)){
        semuhat=sqrt(wrse2/ws)
        return(list(coef=muhat,se=semuhat,wrse2=wrse2))
    }else{
	sebetahat=sqrt(wrse2/wgg)                      #compute se(betahat) using formula in documentation
        sebetahat[wgg.ind]=NA 
        semuhat=sqrt((1/ws+gwmean^2/wgg)*wrse2)       #compute se(muhat) using formula in documentation
        semuhat[wgg.ind]=NA
        covmubeta=colSums(w*ggwmeanm)/ws/wgg*wrse2-gwmean*sebetahat^2   #compute covariance between muhat and betahat
        return(list(muhat=muhat,semuhat=semuhat,betahat=betahat,sebetahat=sebetahat,covmubeta=covmubeta,wrse2=wrse2))
    }
}

compute.dispersion=function(p,n,ng,indexnm,forcebin,ind=NULL,ord=NULL,lg=NULL,x=NULL,x.s=NULL,x.f=NULL){
   if(is.null(lg)){
     if(forcebin|ng==1)                         #force dispersion to be absent
       return(1)
   }else{
     if(forcebin|(ng==lg))                   #(or if there is 1 obs in each group)
       return(1)
   }

    #find effective number of observations after getting rid of missing data
    ngn=!indexnm
    ngn=colSums(ngn)
    ngn=rep(ngn,times=ng)
    
    if(is.null(lg)){
        ss=x.s+x.f
        p=rep(p,times=ng)     
        d.ini=1/(ngn-1)*(x.s-p*ss)^2/(ss*p*(1-p)) #compute dispersion factor as in McC and Nelder
        d.ini[d.ini==Inf]=0
        d.ini[is.na(d.ini)]=0         
        d.m=matrix(d.ini,ncol=ng)
        d=rowSums(d.m)            
        d[d<1]=1                                  #do not allow underdispersion
    }else{
        x=x[ord,]
        x.sf=extract.sf(x,n)
        s=x.sf$x.s+x.sf$x.f
        pn=NULL
        for(i in 1:lg){    
            pn=c(pn,rep(p[(n*(i-1)+1):(n*i)],times=sum(ind[[i]])))
        }        
        d.ini=1/(ngn-lg)*(x.sf$x.s-pn*s)^2/(s*pn*(1-pn))   #compute dispersion factor as in McC and Nelder
        d.ini[d.ini==Inf]=0
        d.ini[is.na(d.ini)]=0         
        d.m=matrix(d.ini,ncol=ng)
        d=rowSums(d.m)
        d[d<1]=1                                      #do not allow underdispersion
        d=rep(d,times=lg)
    }
    return(d)
}




glm.coef=function(z,g,n,center,repara){
    lg=length(levels(g))
    mbvar=NULL
    if(lg==2){                              #2 categories
        covv=NULL
        if(center==TRUE){                         #considered centered and uncentered covariate separately    
            g.num=sort(as.numeric(levels(g))[g])
            g.num=unique(g.num-mean(g.num))
            w1=g.num[1]                           #weights come in because covariate is centered
            w2=g.num[2]
            coef=w2*z$mu[1:n]-w1*z$mu[(n+1):(2*n)]    #compute logit(p) for each group
            coef=c(coef,z$mu[(n+1):(2*n)]-z$mu[1:n])  #compute intercept and slope
            var=w2^2*z$var[1:n]+w1^2*z$var[(n+1):(2*n)]   #compute var(logit(p)) for each group
            var=c(var,z$var[(n+1):(2*n)]+z$var[1:n])       #compute var for intercept and slope
        }else{
            coef=z$mu-c(rep(0,n),rep(z$mu[1:n],times=(lg-1)))                      #compute intercept and slope
            var=z$var+c(rep(0,n),rep(z$var[1:n],times=(lg-1)))        #compute var of intercept and slope
        }
        if(repara==TRUE){
            mbvar=-var[1:n]/var[(n+1):(2*n)]                               #compute gamma as in documentation if reparametrization is used
            coef[1:n]=coef[1:n]-coef[(n+1):(2*n)]*mbvar                      #reparametrized estimates
            var[1:n]=var[1:n]-var[1:n]^2/var[(n+1):(2*n)]              #reparametrized Ses
        }
    }else if(lg==3){                        #3 groups case as in PoissonBinomial_etc
        if(center==TRUE){                         #considered centered and uncentered covariate separately    
            g.num=sort(as.numeric(levels(g))[g])
            g.num[g.num!=1]=0
            g.num[g.num==1]=1
            g.num=unique(g.num-mean(g.num))
            w1.1=g.num[1]
            w1.2=g.num[2]
            g.num=sort(as.numeric(levels(g))[g])
            g.num[g.num!=2]=0
            g.num[g.num==2]=1
            g.num=unique(g.num-mean(g.num))
            w2.1=g.num[1]
            w2.2=g.num[2]
            coef=(w1.2+w2.1)*z$mu[1:n]-w1.1*z$mu[(n+1):(2*n)]-w2.1*z$mu[(2*n+1):(3*n)]
            coef=c(coef,z$mu[(n+1):(3*n)]-rep(z$mu[1:n],times=(lg-1)))
            var=(w1.2+w2.1)^2*z$var[1:n]+(w1.1)^2*z$var[(n+1):(2*n)]+(w2.1)^2*z$var[(2*n+1):(3*n)]
            var=c(var,z$var[(n+1):(3*n)]+rep(z$var[1:n],times=(lg-1)))
            covv=z$var[1:n]
        }else{                       #3 groups case
            coef=z$mu-c(rep(0,n),z$mu[1:(2*n)])
            var=z$var+c(rep(0,n),z$var[1:(2*n)])
            covv=-z$var[(n+1):(2*n)]
        }
    }
    return(list(mu=coef,var=var,mbvar=mbvar,covv=covv))
}

compute.lm=function(g,coef,se,mbvar,n,index,repara){
    if(is.null(g)){
        na.ind=is.na(coef[1:n])|is.na(se[1:n])  
        coef[na.ind|index]=NA  #set muhat and se(muhat) to NA for those models with insufficiant data or NAs
        se[na.ind|index]=NA
        return(matrix(c(coef,se),nrow=2,byrow=T))                
    }else{
        index=rep(index,times=2)
        na.ind=is.na(coef[1:n])|is.na(se[1:n])|is.na(coef[(n+1):(2*n)])|is.na(se[(n+1):(2*n)])
        na.ind2=rep(na.ind,times=2)
        coef[na.ind2|index]=NA  #set muhat and se(muhat) to NA for those models with insufficiant data or NAs
        se[na.ind2|index]=NA
        toreturn=array(rbind(coef,se),dim=c(2,n,2))
        if(repara==TRUE){
            mbvar[index[1:n]]=NA
            mbvar[na.ind]=NA
            return(matrix(rbind(apply(toreturn,2,rbind),mbvar),ncol=n))
        }else
            return(matrix(rbind(apply(toreturn,2,rbind),mbvar),ncol=n)) #should this be return(apply(toreturn,2,rbind))????
    }
}

compute.glm=function(x,g,d,n,na.index,repara){
    se=sqrt(x$var*d)           #dispersion
    mu=x$mu
    mu[na.index]=NA          #set muhat and se(muhat) to NA for those models with insufficiant data
    se[na.index]=NA
    
    if(is.null(g))
        return(matrix(c(mu,se),nrow=2,byrow=T))
    else{
        if(is.factor(g)){
            lg=length(levels(g))
            toreturn=array(rbind(mu,se),dim=c(2,n,lg))
            if(lg==3){
                if(length(d)==1)
                    covv=x$covv*d   #dispersion
                else
                    covv=x$covv*d[1:n]   #dispersion
                covv[na.index[1:n]]=NA   #check that this is correct?????
                return(matrix(rbind(apply(toreturn,2,rbind),covv),ncol=n)) 
            }else if(lg==2){
                if(repara==FALSE)
                    return(apply(toreturn,2,rbind))
                else{
                    mbvar=x$mbvar
                    mbvar[na.index[1:n]]=NA
                    return(matrix(rbind(apply(toreturn,2,rbind),mbvar),ncol=n))
                }
            }
        }
    }
}



#' Fit the model specified in documentation, using either a weighted least squares approach or a generalized linear model approach, with some modifications.
#'
#' This function is optimized for fitting multiple models simultaneously

glm.approx=function(x,g=NULL,minobs=1,pseudocounts=0.5,all=FALSE,eps=1e-8,center=FALSE,repara=FALSE,forcebin=FALSE,lm.approx=FALSE,disp=c("add","mult"),bound=0.02){
    disp=match.arg(disp)
    if(is.vector(x)){dim(x)<- c(1,length(x))}  #if x is a vector convert to matrix 
    n=ncol(x)/2
    ng=nrow(x)
    
    if(ng==1){
        lm.approx=FALSE             #if x has 1 row don't use the lm approximation
        forcebin=TRUE 
    }else{
        x.sf=extract.sf(x,n)                      #extract success and failure counts separately
        indexn=(x.sf$x.s==0&x.sf$x.f==0)                  #find indices for which there is no data
        indexnm=matrix(indexn,nrow=ng,byrow=T)
    }
    if(lm.approx==TRUE){                        #use WLS approximation
        na.index=colSums(matrix((x.sf$x.s+x.sf$x.f)!=0,ncol=n,byrow=T))<minobs    #find indices for which there is insufficient data
        z=compute.approx.z(x.sf$x.s,x.sf$x.f,bound,eps,pseudocounts,all,indexn)          #obtain estimates for logit(p) and var(logit(p))
        if(is.null(g)){                           #smoothing multiple signals without covariate  
            res=wls.coef(z,disp,indexnm,n,ng,forcebin)
            return(compute.lm(g,res$coef,res$se,res$mbvar,n,na.index,repara))   
        }else{                                           
            if(is.factor(g)) g=as.numeric(levels(g))[g]   #if g is a 2-level factor convert to numeric
            if(center==TRUE) g=g-mean(g)
            res=wls.coef(z,disp,indexnm,n,ng,forcebin,g,repara)
            return(compute.lm(g,res$coef,res$se,res$mbvar,n,na.index,repara))
        }
    }else{                                          #use GLM     
        if(is.null(g)){                               #case where g is absent
            if(ng>1)                   #smoothing multiple signals
                x=colSums(x)                              #pool data together as in GLM
            x=matrix(x,ncol=n)
            na.index=(colSums(x)==0)
            z=compute.approx.z(x[1,],x[2,],bound,eps,pseudocounts,all,NULL,TRUE)          #obtain estimates for logit(p) and var(logit(p)) #indexn=NULL???
            d=compute.dispersion(z$p,n,ng,indexnm,forcebin,x.s=x.sf$x.s,x.f=x.sf$x.f)      #computes dispersion  
            return(compute.glm(z,g,d,n,na.index,repara))
        }else{                                        #case where g is present
            if(is.factor(g)){                           #first consider case where g is factor, and hence with closed form solution
                lg=length(levels(g))
                ord=sort.list(g)
                if(ng>lg){                                #pool successes and failures in the same category, depending on if there are more obs than no. of categories or not
                    x.mer=matrix(0,nrow=lg,ncol=2*n)
                    ind=list(0)
                    for(i in 1:lg){
                        ind[[i]]=(g==levels(g)[i])
                        x.mer[i,]=colSums(matrix(x[ind[[i]],],nrow=sum(ind[[i]])))   
                    } 
                }else{
                    ind=NULL
                    x.mer=x[ord,]
                }		  
                x.mer.s=x.mer[,(1:(2*n))%%2==1]           #now consider pooled data as effective raw data and extract successes and failures
                x.mer.f=x.mer[,(1:(2*n))%%2==0]
                na.index=colSums(matrix((x.mer.s+x.mer.f)!=0,ncol=n))<pmin(minobs,lg)     #find indices with insufficient data #since data are pooled, min no. of obs cannot be less than number of groups in g
                na.index=rep(na.index,times=lg)
                x.s.m=as.vector(t(x.mer.s))               #extract successes and failures from pooled data
                x.f.m=as.vector(t(x.mer.f))
                indexn=(x.s.m==0&x.f.m==0)                #define indices where pooled data still has no data
                z=compute.approx.z(x.s.m,x.f.m,bound,eps,pseudocounts,all,indexn,TRUE)          #obtain estimates for logit(p) and var(logit(p)) #indexn?
                res=glm.coef(z,g,n,center,repara)
                d=compute.dispersion(z$p,n,ng,indexnm,forcebin,ind,ord,lg,x)   #computes dispersion
                return(compute.glm(res,g,d,n,na.index,repara))
            }else{                                    #now consider the case when g is quantitative
                x=matrix(x,ncol=n)
                if(center==TRUE) g=g-mean(g)
                #use the bintest function to fit a GLM separately to each case for quantitative covariate
                return(apply(x,2,bintest,g=g,minobs=minobs,pseudocounts=pseudocounts,all=all,forcebin=forcebin,repara=repara))
            }
        }
    }
}  
shimlab/HMTree documentation built on May 29, 2019, 9:25 p.m.