R/MultipleTesting_HeteroDist_ModFuncs.r

Defines functions: storeyPi0Est, StoreyFDREst, FDRAdjustedPval, BHFDRApp, HeyseAdjFunc, GenEstProp, pvalOneSideBTSupport, pvalOneSideFETSupport, pvalueByBinoSupport, pvalFETSupport, getcellcountsandmarginals_DE, getcellcountsandmarginals, fulltable, pvalueDist, Div_Appr_Unif, GetDivergenceMatrix, eNetFull, eNetBuilder, CheckIt

####################################################################
###########  function: check basic numeric properties  ############
####################################################################
# small helper: report NA/NaN/Inf, range, and dimensions of a vector or matrix
CheckIt = function(vector_in) {
   # check for a list first: is.infinite() errors on lists
   if(is.list(vector_in)) stop("groupwise data is a list","\t")

   tmp1 = any(is.na(vector_in) | is.nan(vector_in) | is.infinite(vector_in))
   cat("^^any NA, NaN or Inf:",tmp1,"...")

   if (is.vector(vector_in)) {
   cat("Min is:",min(vector_in)," Max is:", max(vector_in),"...")
   cat("length is",length(vector_in),"\t") }
   
   if (is.matrix(vector_in)) {
      nr = nrow(vector_in); nc = ncol(vector_in)
      cat("nrow:",nr,"; ncol:",nc,"\t")
     if (nc == 2) {
          rs1 =  rowSums(vector_in)
         cat("^^min rowSum:",min(rs1),"^^Max rowSum:",max(rs1),"\n") 
         }
      if (nc == 4) {
          rs1 =  rowSums(vector_in[,1:2]) # for FET based on individual marginals
         cat("^^min rowSum:",min(rs1),"^^Max rowSum:",max(rs1),"\n") 
         }    
     }   # end of matrix branch
   }  
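
# Illustrative sketch (not run when sourcing; toy inputs):
if (FALSE) {
  CheckIt(c(0.1, 0.5, NA))        # vector branch: flags the NA, prints range and length
  CheckIt(matrix(1:8, ncol = 2))  # matrix branch: prints dims and the row-sum range
}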
 

###########################################################################
#### function: updated main grouping with specified minimal group size 
#######################################################################
### function
# FOR PRACTICAL PURPOSES, to_merge is always TRUE.

eNetBuilder = function(div_mat = NULL,ngp = NULL, merge_size = NULL,rad = NULL)
  {
  if (!is.matrix(div_mat))
    stop("^^ A matrix with entries as divergences is needed ...","\n")
  
  m_lt = nrow(div_mat)

  ngrptmp = 1; cnt_idx = double(0);  div_mat_in = div_mat
  all_idx = 1:m_lt;  # caution, no longer all_idx = 1:m 
  
  idx_left = all_idx ; itn = length(idx_left)
  centers_id = double(0) ;    grp_idx = vector("list",ngp)

  while(itn & (ngrptmp <= ngp)) {
    # cat("^^Forming group", ngrptmp, "....","\n")
   ball = vector("list",itn);  ball_size = double(itn)

   # compare itn balls
   for (i in 1:itn) {
        ball_id_tmp = which(abs(div_mat_in[i,]) <= rad)
        ball[[i]] = idx_left[ball_id_tmp]    # ids in ball from original 1:nrow(div_mat) indices
        ball_size[i] = length(ball[[i]])    }

    smax_id = which.max(ball_size);  id_mxball = idx_left[smax_id]
    ball_max = ball[[smax_id]];  size_mxball = ball_size[smax_id]

   # cat("^^Ids: e-ball center", id_mxball,". Ball size:", size_mxball, "\n") # "ids in ball:",ball_max,"\n")
    
    centers_id = c(centers_id,id_mxball);     grp_idx[[ngrptmp]] = ball_max
    cnt_idx = c(cnt_idx,ball_max)
    # added check
   # cat("^^# of Id used",length(cnt_idx),"\n")
    
    if (any(cnt_idx <0))     stop("^^negative id in current ids","\n") 
    
    idx_left = all_idx[-cnt_idx]  # remove already-grouped ids from 1:m_lt

    itn = length(idx_left)
   # cat("^^# of ids left currently",itn,"\n") # ". Idx left:", idx_left,"\n")

    # The following finds the best partition, case 1:  ngrptmp <= ngp-1
    if (ngrptmp <= ngp-1) {
       if (itn <= merge_size) {  
         # the condition must be itn <= merge_size (not itn < merge_size) because here ngrptmp <= ngp-1
       #  cat("^^Can not form",ngp,"groups each with cardinality no less than",merge_size,"\n")
         cat("^^---- Regroup with smaller e-net size ---- ","\n")
         rad = rad/2  # adjust this
         idx_left = 1:m_lt; itn = length(idx_left)
         ngrptmp = 0;  cnt_idx = double(0);  div_mat_in = div_mat
         }

       if (itn == merge_size & ngrptmp == ngp-1) {
         cat("^^Merging",itn,"ids.",ngp,"e-balls reached as requested","\n")
         grp_idx[[ngp]] =  idx_left
         itn = 0
        }
       
       if (itn > merge_size) {
         # cat("^# of id's left to group is", itn,",continue grouping","\n")
         div_mat_in = div_mat[idx_left,][,idx_left]
         }
    }  # end  if (ngrptmp <= ngp-1)

    # case 2:  ngrptmp = ngp
    if (ngrptmp == ngp) {
      # nothing left 
      if (itn == 0)
        cat("^^",ngp,"e-balls reached as requested","\n")

      if (itn > merge_size) {
     #  cat("^^---Some ids left for more groups. Regroup with larger e-net size --- ","\n")
        rad = 1.5*rad  # adjust this
        idx_left = 1:m_lt; itn = length(idx_left)
         ngrptmp = 0;  cnt_idx = double(0);  div_mat_in = div_mat
       }  
       
       if (itn <= merge_size) {
           grp_idx[[ngp]] = c(grp_idx[[ngp]],idx_left)
          cat("^^Merging",itn,"ids.",ngp,"e-balls reached as requested","\n")
           }
      } # end if (ngrptmp == ngp)

    ngrptmp = ngrptmp + 1

  } # end of while
      return (grp_idx)
} # end of function

### ###########################
eNetFull = function(metrics_in = NULL, ngrp_in = NULL, merge_size = NULL, rad_in = NULL, mgpsize_in = NULL) {

    if(is.null(mgpsize_in))
     stop("^^Specify minimal group size (MGS)")
    rad_tmp = rad_in;  mgps_tmp = 0  # start below any valid minimal group size so at least one grouping pass runs

    while (mgps_tmp < mgpsize_in) {

      groups_tmp = eNetBuilder(metrics_in,ngrp_in,merge_size,rad_tmp)
      ngp_tmp = length(groups_tmp)
  #    check minimal group size
      mgps_tmp = min(sapply(groups_tmp,length))
      cat("^^Current achieved minimal group size is", mgps_tmp,"\n")
    } # end of while
    return(groups_tmp)
}   # end of function
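
# Illustrative sketch (not run when sourcing): grouping three items with a toy
# symmetric divergence matrix of the kind GetDivergenceMatrix below produces.
if (FALSE) {
  D = matrix(c(0,    0.3,  0.05,
               0.3,  0,    0.3,
               0.05, 0.3,  0), nrow = 3)
  eNetFull(metrics_in = D, ngrp_in = 2, merge_size = 1, rad_in = 0.3, mgpsize_in = 1)
  # the radius adapts until 2 groups form, e.g. list(1, c(2, 3)) here
}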
 


########################## ########################## ##########################  
################## function:  divergence  matrix   ########################## 
########################## ########################## ##########################

GetDivergenceMatrix = function(pvSpList=NULL)  {   
  
  lgA1 = length(pvSpList)     # caution: no longer m but lgA1
  lgt_div = (lgA1-1)*lgA1/2
  cat("^^Computing", lgt_div, "pairwise divergences. Takes time ...","\n")
  
  infNorm_mat = matrix(0,lgA1,lgA1)
  
  # start of computing pairwise quantities
  for (i in 2:lgA1) { 
       sp_pv1 = pvSpList[[i]];  lg1 = length(sp_pv1)     # pvSpList has been formatted
           
     for (j in 1:(i-1)) {       
       sp_pv2 = pvSpList[[j]]; lg2 = length(sp_pv2)    # pvSpList has been formatted
     
       teva = union(sp_pv1,sp_pv2)  # evaluate cdf at union of pvalue supports 
       cdfn1 = pvalueDist(teva,sp_pv1);  cdfn2 = pvalueDist(teva,sp_pv2)
       
       infNorm_mat[i,j] = max(abs(cdfn1-cdfn2)) 
        }
    }   # end of computing pairwise quantities
      
     infNorm_mat_sym = infNorm_mat + t(infNorm_mat)
      
      cat("^^Finished computing matrix of pairwise divergences...","\n")       
      return(infNorm_mat_sym)
      
  }
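
# Illustrative sketch (not run when sourcing): pairwise sup-norm distances
# between the CDFs of three toy p-value supports (each ascending, ending at 1).
if (FALSE) {
  supps = list(c(0.1, 0.5, 1), c(0.2, 0.6, 1), c(0.05, 0.5, 1))
  GetDivergenceMatrix(supps)   # symmetric 3x3 matrix with zero diagonal
}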
  
 ######################### ########################## ##########################   
### Identify approximately uniform p-value supports; returns the input list and their ids
######################### ########################## ########################## 
 
  Div_Appr_Unif = function(pvSpList=NULL,test_used=NULL,appr_unif_tol = NULL) {
      lg3 = length(pvSpList)
      id_appr_unif = double(0)
       
          for (i in 1:lg3) {             
             sp_pv_tmp = c(0,pvSpList[[i]])  # prepend 0 so the first gap is measured from 0
             if (max(abs(diff(sp_pv_tmp))) < appr_unif_tol)      
              id_appr_unif = c(id_appr_unif,i)
          }  
                      
    return(list(pvSpList,id_appr_unif))       
  } # end of func
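
# Illustrative sketch (not run when sourcing): a support whose gaps (measured
# from 0) all fall below the tolerance counts as approximately uniform.
if (FALSE) {
  supps = list(seq(0.1, 1, by = 0.1), c(0.5, 1))
  Div_Appr_Unif(supps, appr_unif_tol = 0.2)[[2]]   # returns 1: only the first qualifies
}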
 
  
  
  #######################################################################
  ####   Distribution of the p-values  (Generally Applicable)  #########
  #######################################################################
  
  pvalueDist <- function(peva,support)
  {
    lgh=length(support)
    lgh3 <- length(peva)
    y <- double(lgh3)
    for (i in 1:lgh3)  {
      cpv <- peva[i] >= support
      # the key is to compare where t falls in the support
      if (sum(cpv)==0) {y[i]=0} # because t is strictly less than minimum of support
      else if (sum(cpv)==lgh) {y[i]=1} # because t is no less than maximum of support
      else # t falls in one interval
      {  y[i] <- support[max(which(cpv))]   }
    }
    return(y)
  }
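
  # Illustrative sketch (not run when sourcing): for a valid discrete p-value,
  # the null CDF at t is the largest support point not exceeding t.
  if (FALSE) {
    pvalueDist(c(0.01, 0.1, 0.2, 0.9), c(0.05, 0.2, 1))
    # returns 0.00 0.05 0.20 0.20
  }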
  
  

       ######################################################################
      ####     Function to get full tables for  association studies #########
      #######################################################################
      fulltable <- function(twocoldata)
      {
        # m is the number of rows in the data
        m <- nrow(twocoldata)
        # construct a 3x3 full table (2x2 cell counts plus margins) per gene
        tables <- array(0, dim=c(3,3,m))

        controlsum<-sum(twocoldata[,1])
        treatsum<-sum(twocoldata[,2])

         # get 2-by-2 table for each gene
          tables[1,1,]=twocoldata[,1]
          tables[1,2,]=twocoldata[,2]
          tables[2,1,]=controlsum - twocoldata[,1]
          tables[2,2,]= treatsum -twocoldata[,2]

         # get 3rd row and column for each gene
         tables[1,3,]= tables[1,1,]+ tables[1,2,]
         tables[2,3,]= tables[2,1,]+ tables[2,2,]

           tables[3,1,]=rep(controlsum,m)
           tables[3,2,]=rep(treatsum,m)
           tables[3,3,]=rep(treatsum+controlsum,m)

         return(tables)
      }
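
      # Illustrative sketch (not run when sourcing): toy two-column count data.
      if (FALSE) {
        tabs = fulltable(cbind(c(5, 0, 2), c(1, 3, 4)))
        tabs[, , 1]   # 3x3 table for the first row, margins in row 3 and column 3
      }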


      ################################################################# ###########################
      #### Function to get marginals and cell counts from generated counts for association studies
      ################################################################# ###########################

      getcellcountsandmarginals <- function(countveccontrol,countvectreat)
      {
        m <- length(countveccontrol)
        twocolsimdata <- cbind(countveccontrol,countvectreat)
        simtables <- fulltable(twocolsimdata)

        simallcellcounts <- vector("list",m)
        for (i in 1:m) {simallcellcounts[[i]] <- simtables[1:2,1:2,i]}

        simallmarginals <- vector("list",m)
        for (i in 1:m) {simallmarginals[[i]] <- c(simtables[1,3,i],simtables[2,3,i],simtables[3,1,i])}

         y2 <- list(simallcellcounts,simallmarginals)
          return(y2)
      }
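
      # Illustrative sketch (not run when sourcing):
      if (FALSE) {
        res = getcellcountsandmarginals(c(5, 0, 2), c(1, 3, 4))
        res[[1]][[1]]   # 2x2 cell counts for the first gene
        res[[2]][[1]]   # its two row totals and the first column total
      }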

      ############################################################# ###########################
      #### Function to Get marginals and cellcounts from generated counts for differential expression
      ################################################################# ###########################
      
      getcellcountsandmarginals_DE <- function(data_in)
      {
        m <-nrow(data_in)
        # construct a 3x3 full table
        tables<- array(0, dim=c(3,3,m))
        
        tables[1,1,] = data_in[,1]
        tables[2,1,] = data_in[,2]
        tables[1,2,] = data_in[,3] - data_in[,1]
        tables[2,2,] = data_in[,4] - data_in[,2]
        tables[1,3,]= data_in[,3]
        tables[2,3,]= data_in[,4]
        tables[3,3,] = data_in[,3] + data_in[,4]
        tables[3,1,] = data_in[,1] + data_in[,2]
        tables[3,2,] = tables[3,3,] - tables[3,1,]
        
        simallcellcounts <- vector("list",m)
        for (i in 1:m) {simallcellcounts[[i]] <- tables[1:2,1:2,i]}
        
        simallmarginals <- vector("list",m)
        for (i in 1:m) {simallmarginals[[i]] <- c(tables[1,3,i],tables[2,3,i],tables[3,1,i])}
        
        y2 <- list(simallcellcounts,simallmarginals)
        return(y2)
      }
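
      # Illustrative sketch (not run when sourcing): columns are count in group 1,
      # count in group 2, total for group 1, total for group 2.
      if (FALSE) {
        de = cbind(c(5, 0), c(1, 3), c(20, 20), c(25, 25))
        getcellcountsandmarginals_DE(de)[[1]][[1]]   # 2x2 cell counts, first gene
      }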
      
      
      
      
 ###################################################################################
  #### Function to get the two-sided p-value and its support for FET
  ###################################################################################

    pvalFETSupport <- function(cellmatrix)
      {

          ns = cellmatrix[1,1]; nw = sum(cellmatrix[,1]); nb = sum(cellmatrix[,2]); nd = sum(cellmatrix[1,])
#          x <- x[1, 1]; m <- sum(x[, 1]);    n <- sum(x[, 2]); k <- sum(x[1, ])
#        lo <- max(0, k - n);   hi <- min(k, m);       support <- lo:hi
#        logdc <- dhyper(support, m, n, k, log = TRUE)
          lo <- max(0, nd - nb);   hi <- min(nd, nw)
          support <- lo:hi
          # all probability masses for this distribution
          mass = dhyper(support, nw, nb, nd)
          # realized mass
          realizedmass = dhyper(ns, nw, nb, nd)

           # two-sided pvalue
           pvalue <- sum(mass[which(mass <= realizedmass)])
            # add: sum_y P(y) over y such that P(y) < P(X)
           lessProb = sum(mass[which(mass < realizedmass)])
           # eqProb: sum_y P(y) over y such that P(y) = P(x)
           eqProb = pvalue - lessProb
           
           if (is.na(pvalue)) print("pvalue is NA or NaN")

          ### compute p value support
          lgh2 = length(mass)
          temp <- double(lgh2)
          for (i in 1:lgh2)
          {
            # temp[i] is the two-sided pvalue for i-th table given marginals
            temp[i] <- sum(mass[which(mass <= mass[i])])
          }
           # sort pvalue support
          psupport <- unique(sort(temp,decreasing=FALSE))

          # randomized p-value (definition from Dickhaus et al., Lemma 2)
        randPval = pvalue - runif(1)*eqProb
        randPval = min(1,randPval)
        
        # mid p-value (definition from Hwang et al. 2001)
        MidPval = lessProb + 0.5*eqProb
        MidPval = min(1,MidPval)
          # return all three p-values
          pvals = c(pvalue,MidPval,randPval)
          
          # prepend the three p-values to the front of the support vector
          support<- c(pvals, psupport)
          return(support)
          }
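
    # Illustrative sketch (not run when sourcing): the first three entries of the
    # returned vector are the (conventional, mid, randomized) two-sided p-values;
    # the rest is the sorted p-value support.
    if (FALSE) {
      cellmat = matrix(c(3, 12, 10, 5), nrow = 2)
      out = pvalFETSupport(cellmat)
      out[1]        # close to fisher.test(cellmat)$p.value (fisher.test adds a tiny tolerance)
      out[-(1:3)]   # the support
    }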


      ################################################################################### 
      ####  Function to get two-sided pvalue and its support for binomial test    #########
      ###################################################################################

      pvalueByBinoSupport <- function(marginal)
      {
        #  under the null that the two Poisson rates are equal, the UMP test follows Binomial with p = 0.5
        spTmp = seq(from=0,to=marginal[2])
        d <- dbinom(spTmp,marginal[2],0.5,TRUE) ### using log of probs provides more accurate values
        ord <- order(d)
        d.ordered <- d[ord]
        
        # the pmf of Binomial(n, 0.5) is symmetric; when marginal[2] is odd each probability appears exactly twice
        relDev <- abs(diff(d.ordered)) ### check differences
        idx <- which(relDev > 0 & relDev <= .Machine$double.eps*2) ### "2" has proven to be a good choice
        # slight numerical correction
        ler = length(idx)  
        if (ler >0) {  
          for (jx in 1:ler) {
            d.ordered[c(idx[jx], idx[jx] + 1)] <- sum(d.ordered[c(idx[jx], idx[jx] + 1)])/2        
          } } 
        
        d.ordered <- exp(d.ordered)
        s <- sum(d.ordered) # check total prob
        if(s != 1) d.ordered <- d.ordered/s     
        d <- d.ordered[order(ord)]              ### restore original order (if necessary)
        #### end of correction
        mass = d      # re-assign

          ## compute pvalue
          realizedmass <- mass[which(spTmp==marginal[1])]

           # two-sided pvalue
           pvalue <- sum(mass[which(mass <= realizedmass)])
           # add: sum of probabilities of y such that P(y) < P(X)
           lessProb = sum(mass[which(mass < realizedmass)])
           # eqProb: sum_y P(y) st P(y) = P(x)
           eqProb = pvalue - lessProb
           
           # check p-value
           if (is.na(pvalue)) print("pvalue is NA or NaN")

          ### compute p value support
          lgh2 = length(mass)
          temp <- double(lgh2)
          for (i in 1:lgh2)
          {
            # temp[i] is the two-sided pvalue for i-th table given marginals
            temp[i] <- sum(mass[which(mass <= mass[i])])
          }
          # sort pvalue support
          psupport <- sort(temp,decreasing=FALSE)
          
          # randomized p-value
        randPval = pvalue - runif(1)*eqProb  
        randPval = min(1,randPval)
        
        # mid p-value
        MidPval = lessProb + 0.5*eqProb
        MidPval = min(1,MidPval)
          # return all three p-values
          pvals = c(pvalue,MidPval,randPval)
           
          # prepend the three p-values to the front of the support vector
          support<- c(pvals, psupport)
          return(support)
          }
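
      # Illustrative sketch (not run when sourcing): x = 2 events out of n = 10,
      # testing Binomial(10, 0.5).
      if (FALSE) {
        out = pvalueByBinoSupport(c(2, 10))
        out[1]   # close to binom.test(2, 10, 0.5)$p.value
      }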

      
      #############################################################
      ######## one-sided p-value and its support for FET ##########
      ########################################################
      pvalOneSideFETSupport <- function(cellmatrix, Side = "Right")
      {
        ns = cellmatrix[1,1]; nw = sum(cellmatrix[,1]); nb = sum(cellmatrix[,2]); nd = sum(cellmatrix[1,]);
        drx = dhyper(ns, nw, nb, nd)
        lo <- max(0, nd - nb);   hi <- min(nd, nw)
        #          x <- x[1, 1]; m <- sum(x[, 1]);    n <- sum(x[, 2]); k <- sum(x[1, ])
        #        lo <- max(0, k - n);   hi <- min(k, m);       support <- lo:hi
        #        logdc <- dhyper(support, m, n, k, log = TRUE)
        
        if (Side == "Right") {
          # get support
          psupport = dhyper(lo:ns, nw, nb, nd) + phyper(lo:ns, nw, nb, nd,lower.tail = FALSE)
          psupport = unique(sort(psupport))
          # tail
          ptail =  phyper(ns, nw, nb, nd,lower.tail = FALSE)
          # type of p-value
          pvalO = ptail + drx
          pvalMid = ptail + 0.5*drx    # definition from Hwang et al. 2001
          pvalRnd = ptail + runif(1)*drx   # definition from Habiger 2015
        } 
        
        if (Side == "Left") {
          # get support
          psupport = phyper(lo:nd, nw, nb, nd)
          psupport = unique(sort(psupport))
          # tail
          ptail =  phyper(ns, nw, nb, nd)
          # type of p-value
          pvalO = ptail
          pvalMid = ptail - drx + 0.5*drx     # by duality
          pvalRnd = ptail - drx + runif(1)*drx  # by duality
        }
        
        # save pval and support
        pvals = c(pvalO,pvalMid,pvalRnd)
        support<- c(pvals,psupport)   
        return(support)
      }
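
      # Illustrative sketch (not run when sourcing):
      if (FALSE) {
        cellmat = matrix(c(3, 12, 10, 5), nrow = 2)
        out = pvalOneSideFETSupport(cellmat, Side = "Right")
        out[1]   # P(X >= 3); equals fisher.test(cellmat, alternative = "greater")$p.value
      }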
      
      ########################################################
      ######## one-sided p-value and its support for Binomial ##########
      ########################################################
      pvalOneSideBTSupport <- function(cellvector, Side = "Right")
      {
        ns = cellvector[1]; nt = cellvector[2]
        drx = dbinom(ns, nt,0.5)
        
        if (Side == "Right") {
          # get support
          psupport = pbinom(0:ns, nt,0.5,lower.tail = FALSE) + dbinom(0:ns, nt,0.5)
          psupport = unique(sort(psupport))
          # tail
          ptail =  pbinom(ns, nt,0.5,lower.tail = FALSE)
          # type of pval
          pvalO = ptail + drx
          pvalMid = ptail + 0.5*drx
          pvalRnd = ptail + runif(1)*drx 
        } 
        
        if (Side == "Left") {
          # get support
          psupport = pbinom(0:nt, nt,0.5)
          psupport = unique(sort(psupport))
          # tail
          ptail =  pbinom(ns, nt,0.5)
          # type of p-value
          pvalO = ptail
          pvalMid = ptail - drx + 0.5*drx     # by duality
          pvalRnd = ptail - drx + runif(1)*drx  # by duality
        }
        
        # save pvalue and support
        pvals = c(pvalO,pvalMid,pvalRnd)
        support<- c(pvals,psupport)   
        return(support)
      }
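
      # Illustrative sketch (not run when sourcing):
      if (FALSE) {
        out = pvalOneSideBTSupport(c(7, 10), Side = "Right")
        out[1]   # P(X >= 7), X ~ Binomial(10, 0.5); equals
                 # binom.test(7, 10, 0.5, alternative = "greater")$p.value
      }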
      

########## generalized estimator of the proportion of true null hypotheses (pi0)
GenEstProp <- function(pvector,psupports,TuningPar=c(0.5,100))
  {
    # m is the number of p-values
    m = length(pvector)
    minSupp = unlist(lapply(psupports, min))
    
    # define smallest guiding value
    maxmin = max(minSupp)
    cat("^^max of min's of supports: ",maxmin,"\n")
    
    singPsupp = which(minSupp == 1)
    NofDirac = length(singPsupp)
      
    if (NofDirac == m) {
      cat("^^All CDF's are Dicac masses ..","\n")
      genest = 1
      } else {  #  case 2: NofDirac < m; this case lasts till the end of estimation
  
        if (NofDirac >0) { # case 2(a): m>NofDirac >0
          maxminA = max(minSupp[-singPsupp])
          cat("^^second max of min's of supports: ",maxminA,"\n")
          schLoc = (1:m)[-singPsupp]
        if (maxminA < 0.5) {
          tunings = seq(maxminA+TuningPar[1]*(0.5-maxminA), 0.5,length.out=TuningPar[2])
        } else{ 
          tunings = c(maxminA)  
          }
        } else {  # case 2(b):   maxmin < 1, i.e., NofDirac = 0, i.e. no Dirac mass
          schLoc = 1:m
          if (maxmin < 0.5) {
            tunings = seq(maxmin+TuningPar[1]*(0.5-maxmin), 0.5,length.out=TuningPar[2])
          } else {  
            tunings = c(maxmin)  
           }
          
        } # end of case 2(b)
      
    # start the search when NofDirac < m
    Lt = length(tunings)
    est = double(Lt)
    cuts = double(m)
    
    for (j in 1:Lt) {
      lamb = tunings[j]
      
      for (i in schLoc) {
        psupp = psupports[[i]]
        if (sum(psupp<=lamb) >0 ) {
          tmp = max(psupp[which(psupp<=lamb)])
          loc = max(which(psupp==tmp))
          cuts[i]= psupp[loc]
        } else  {    cuts[i] = NA }
        
      }
      cutstmp = cuts[!is.na(cuts)]
      
      esttmp = sum(as.numeric(pvector[!is.na(cuts)]>cutstmp)/(1-cutstmp))/m + 
        1/((1-lamb)*m)+ sum(is.na(cuts))/m+length(singPsupp)/m
      
      est[j] = min(1,esttmp)
    }
    # take the average
    genest = mean(est[!is.nan(est)])
   } # end of case 2: NofDirac < m 
    
    return(genest)
  }
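
# Illustrative sketch (not run when sourcing): toy p-values with their discrete
# supports (each ascending with maximum 1).
if (FALSE) {
  supps = list(c(0.02, 0.4, 1), c(0.1, 0.6, 1), c(0.05, 0.3, 1))
  GenEstProp(c(0.02, 0.6, 1), supps, TuningPar = c(0.5, 10))   # estimate of pi0 in (0, 1]
}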

    ###################################################
    ####### Subsection 19: BH FDR procedure        ########
    ######################################################
    # BH procedure requires input of pvalues and their indices
      BHFDRApp <- function(Pvals,FDRlevel)
      {   
        if (any(is.na(Pvals)) | any(is.nan(Pvals))) {
          cat("^^NA or NaN in pvalues...","\n")
          print(Pvals[is.na(Pvals)])
        }
        
        if (any(is.na(FDRlevel)) | any(is.nan(FDRlevel))) cat("^^NA or NaN FDRlevel...","\n")
           
        if (is.vector(Pvals))  lgh3 <- length(Pvals)
        if (is.matrix(Pvals) | is.data.frame(Pvals))  lgh3 <- dim(Pvals)[1]
        PvalAndIdx <- cbind(Pvals,seq_len(lgh3))
        PvalAndIdxOrd <- PvalAndIdx[order(PvalAndIdx[,1]),]
        
     #   cat("^^Dims of PvalAndIdxOrd is:",as.vector(dim(PvalAndIdxOrd)),"\n")
    
        BHstepups <- seq_len(lgh3)*(FDRlevel/lgh3)
        cmp3 <- PvalAndIdxOrd[,1] <= BHstepups
        scmp3 <- sum(cmp3)
    
       # collect rejections if any
        if (scmp3 == 0) {   
        print ("No rejections by BH procedure")        # when there are no rejections
        rejAndTresh <- list(matrix(numeric(0), ncol=2,nrow=0),0)       
        }  else   {  r <- max(which(cmp3))        
            #  cat("^^^ Minimax index in BH is:",r,"\n")
             # cat("^^^ Minimax threshold in BH is:",BHstepups[r],"\n")
                   if (r==1) {
                    # when r == 1 a single row is selected and drops to a vector, so coerce it back to a two-column matrix
                     BHrej = as.matrix(t(PvalAndIdxOrd[1:r,]))  
                   #  print(BHrej)
                   } else {
                      BHrej <- PvalAndIdxOrd[1:r,]
                   }
               rejAndTresh <- list(BHrej,BHstepups[r])    
              }
            return(rejAndTresh)
      }
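
      # Illustrative sketch (not run when sourcing):
      if (FALSE) {
        pv = c(0.001, 0.02, 0.04, 0.3, 0.9)
        res = BHFDRApp(pv, 0.05)
        res[[1]]   # rejected p-values (column 1) with their original indices (column 2)
        res[[2]]   # the BH step-up threshold used
        # cross-check: which(p.adjust(pv, "BH") <= 0.05) gives the same indices
      }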


     #### rejection based on adjusted p-values
      FDRAdjustedPval <- function(Pvals,FDRlevel)
      {   
        if (any(is.na(Pvals)) | any(is.nan(Pvals))) {
          cat("^^NA or NaN in pvalues...","\n")
          print(Pvals[is.na(Pvals)])
        }
        
        if (any(is.na(FDRlevel)) | any(is.nan(FDRlevel))) cat("^^NA or NaN FDRlevel...","\n")
           
        if (is.vector(Pvals))  lgh3 <- length(Pvals)
        if (is.matrix(Pvals) | is.data.frame(Pvals))  lgh3 <- dim(Pvals)[1]
        PvalAndIdx <- cbind(Pvals,seq_len(lgh3))
        PvalAndIdxOrd <- PvalAndIdx[order(PvalAndIdx[,1]),]
        
     #   cat("^^Dims of PvalAndIdxOrd is:",as.vector(dim(PvalAndIdxOrd)),"\n")
        cmp3 <- PvalAndIdxOrd[,1] <= FDRlevel
        scmp3 <- sum(cmp3)
    
       # collect rejections if any
        if (scmp3 == 0) {   
        print ("No rejections by BH")        # when there are no rejections
        rejAndTresh <- list(matrix(numeric(0), ncol=2,nrow=0),0)       
        }  else   {  r <- max(which(cmp3))        
            #  cat("^^^ Minimax index in BH is:",r,"\n")
             # cat("^^^ Minimax threshold in BH is:",BHstepups[r],"\n")
                   if (r==1) {
                    # when r == 1 a single row is selected and drops to a vector, so coerce it back to a two-column matrix
                     BHrej = as.matrix(t(PvalAndIdxOrd[1:r,]))  
                   #  print(BHrej)
                   } else {
                      BHrej <- PvalAndIdxOrd[1:r,]
                   }
               rejAndTresh <- list(BHrej,PvalAndIdxOrd[,1][r])    
              }
            return(rejAndTresh)
      }
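
      # Illustrative sketch (not run when sourcing): same bookkeeping as BHFDRApp, but
      # the inputs are already-adjusted p-values thresholded directly at FDRlevel.
      if (FALSE) {
        adjp = p.adjust(c(0.001, 0.02, 0.04, 0.3, 0.9), "BH")
        FDRAdjustedPval(adjp, 0.05)[[1]]
      }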


 ### Heyse (2011) adjusted p-values
    HeyseAdjFunc = function (pvec, pSupps)
      {
        m = length(pvec)
        if (m ==1)
          stop("^^^At least two p-values and their supports are needed.")
        # pSupps[[i]] is the support of pvec[i]; entries of pSupps[[i]] are ascendingly ordered
        
        # order p-values ascendingly
        pvecAscend =  pvec[order(pvec,decreasing = F)]
        
        # create the BH sequence of p-values in the Heyse (2011) rephrasing: m*p_(k)/k, where p_(m) is the largest
        pBHAscend = m*pvecAscend/(1:m)
        
        # obtain Q(t) = \sum_{j=1}^m F_j(t)
        # obtain matrix F_j(p_(i))
        stepf = lapply(pSupps, function(x) stepfun(x, c(0, x)))
        p.mat = matrix(NA, m, m)
        for (i in 1:m)  {
          p.mat[i, ] = stepf[[i]](pvecAscend)
        }
        # sum of each column; obtain Q(p_(i)) = \sum_{j=1}^m F_j(p_(i))
        p.sums = colSums(p.mat)
        # Q(p_(i))/i
        p.sums = p.sums/(1:m)
        
        # obtain the Heller (2012) adjusted p-values in the original ordering of pvec
        pR = double(m)
        pR[m:1] = cummin(p.sums[m:1])
        op = order(pvec)
        pR = pmin(1,pR[order(op)])
        
        
        ##### obtain the Heyse (2011) adjusted p-values in the original ordering of pvec ####
        pH = double(m)
        pH[m] = pBHAscend[m]
        pH[(m-1):1] = pmin(cummin(pBHAscend[m:2]),p.sums[(m-1):1])
        pH = pmin(1, pH[order(op)])
        
        # return the Heyse (2011) adjusted p-values; pR holds the Heller (2012) version
        return(pH)
      }
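
    # Illustrative sketch (not run when sourcing): two discrete p-values and their supports.
    if (FALSE) {
      pSupps = list(c(0.05, 0.5, 1), c(0.1, 0.6, 1))
      HeyseAdjFunc(c(0.05, 0.6), pSupps)   # Heyse (2011) adjusted p-values
    }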
 
    #######################################################################
    #### Subsection 18:  Storey's procedure               #########
    #######################################################################

      StoreyFDREst <- function (simpvalues,FDRlevel,pi0est)
      {
         m <- length(simpvalues)
        # the step length in the search depends on the minimal difference between p-values
         sortedpvalvec <- sort(simpvalues)
         diffsortedpvals <- diff(sortedpvalvec)
         # since diff does not compute the difference between the minimal p-value and zero,
         # that gap is included as well, so the step length in thresh is the minimum of
         # the minimal positive diff and the minimal p-value
         threshstep <- min(min(diffsortedpvals[diffsortedpvals > 0]),min(sortedpvalvec))

        # the FDR threshold is known to be no less than the Bonferroni threshold but no larger
        # than FDRlevel. the step length should be sensitive to distances between p-values to
        # make a practical change in R(t) as t changes.
        # sometimes threshstep can be so small that R itself cannot handle it

        # Step 1: crude search first; do not start from min(sortedpvalvec) when m is small
        threshveccrude <- seq(from= 0,to=1,by=10^-3)
        ecdfcrude0 <- max(sum(sortedpvalvec <= threshveccrude[1]),1)/m

        istcrude <-1
        stFDPcrude <- pi0est*threshveccrude[1]/ecdfcrude0
         stFDPcrude0 <- stFDPcrude
        while (stFDPcrude <= FDRlevel)
        {
          istcrude <- istcrude+1;
          # check if the max threshold (t = 1) is reached; if so, break
          if (istcrude > length(threshveccrude)){print("max threshold (t=1) in crude search for Storey's estimators exhausted");break}
          steCDFcrude <- max(sum(sortedpvalvec <= threshveccrude[istcrude]),1)/m
          stFDPcrude <- pi0est*threshveccrude[istcrude]/steCDFcrude
          if (is.na(stFDPcrude) | is.nan(stFDPcrude)) print("NA or NaN occurred as FDP of Storey's estimators")

        }
        if (istcrude == 2) { stsupthreshcrude <- threshveccrude[1]
             stFDPcrudeAtSThresh <- stFDPcrude0    }     else {
              stsupthreshcrude <- threshveccrude[istcrude-1]
             ecdfstsupthreshcrude <- max(sum(sortedpvalvec <= stsupthreshcrude),1)/m
        stFDPcrudeAtSThresh <- pi0est*stsupthreshcrude/ecdfstsupthreshcrude  }
      #  cat("stFDPcrudeAtSThresh is",stFDPcrudeAtSThresh,"at sup thresh",stsupthreshcrude,"\n")

        ############################################
        #### step 2: after a crude threshold has been located, do a finer search from the crude sup threshold
        ### chen's fdr estimator
        # break once the max threshold (t = 1) is reached

         # use while loop for storey's
         if (stsupthreshcrude == 1) {  print("max threshold (t=1) for Storey's estimators exhausted, no fine search in second round")
            stsupthresh <- stsupthreshcrude
         stFDPAtSThresh <- stFDPcrudeAtSThresh} else {
        threshvecst <- seq(from = stsupthreshcrude,to=threshveccrude[istcrude],
                               by= min((threshveccrude[istcrude]-stsupthreshcrude)/m,FDRlevel/m))
         ecdf0st <- max(sum(sortedpvalvec <= threshvecst[1]),1)/m
         stFDP <- pi0est*threshvecst[1]/ecdf0st
          stFDP0 <- stFDP
         ist <-1
        while (stFDP <= FDRlevel)
        {
          ist <- ist+1;
          if (ist > length(threshvecst)) break
          steCDF <- max(sum(sortedpvalvec <= threshvecst[ist]),1)/m
          stFDP <- pi0est*threshvecst[ist]/steCDF
          if (is.na(stFDP) | is.nan(stFDP)) print("NA or NaN occurred as FDP for Storey's estimators in fine search")
        }
        if (ist==2) {stsupthresh <- threshvecst[1]
          stFDPAtSThresh <- stFDP0     } else {
        stsupthresh <- threshvecst[ist-1]
         ecfstsupthresh <- max(sum(sortedpvalvec <= stsupthresh),1)/m
        stFDPAtSThresh <- pi0est*stsupthresh/ecfstsupthresh }
       #  cat("stFDPAtSThresh",stFDPAtSThresh,"at sup thresh",stsupthresh,"\n")
      } # end of else


        # get discoveries
        stDiscovery <- which(simpvalues <= stsupthresh)
        if (length(stDiscovery)==0) print("No discoveries found by Storey's procedures")

        StDiscoveries <- list(stDiscovery,c(stsupthresh,stFDPAtSThresh))
        return(StDiscoveries)

      }
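
      # Illustrative sketch (not run when sourcing): mostly uniform p-values plus a
      # few small ones; pi0 is estimated first via storeyPi0Est below.
      if (FALSE) {
        set.seed(1)
        pv = c(runif(90), rbeta(10, 1, 20))
        pi0 = storeyPi0Est(0.5, pv)
        res = StoreyFDREst(pv, 0.05, pi0)
        res[[1]]   # indices of discoveries
        res[[2]]   # c(threshold, estimated FDP at that threshold)
      }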
      
     # Storey: the new qvalue package stops with an error when a p-value equals exactly 1,
     # so the following estimator is used instead
     storeyPi0Est <- function(lambda,pvector)
      {
          # m is the number of p-values
           m <- length(pvector)

          over <- m*(1-lambda)
          stunmin <- sum(pvector > lambda)/over 
          st <- min(1,stunmin)
          if (is.nan(st))  print("Storey's estimator of pi0 is NaN")
          if (is.na(st))  print("Storey's estimator of pi0 is NA")

          return(st)
      }
