R/glmWrapper.R

Defines functions outputFunc2 lkhrFitOneProbe lkhrWrapper glmWrapper glmFitOneProbe logit getDesignMat.glm checkCovariate getCovInterest inputChecking

Documented in glmWrapper lkhrWrapper

inputChecking<-function(pvalAdjMethod, es, probeID.var,
  gene.var, chr.var)
{
  pvalAdjMethod=match.arg(pvalAdjMethod, choices=c("holm", "hochberg", 
    "hommel", "bonferroni", "BH", "BY", "fdr", "none"))

  # extract feature data
  fDat<-fData(es)
  cn<-colnames(fDat)

  # get probe IDs, gene symbos and chromosome numbers
  pos1<-which(cn == probeID.var)
  if(length(pos1))
  { 
    probeIDs<-fDat[, pos1]
  } else {
    stop(paste("no variable", probeID.var, "in feature data", sep=" "))
  }

  pos1<-which(cn == gene.var)
  if(length(pos1))
  { 
    geneSymbols<-fDat[, pos1]
  } else {
    stop(paste("no variable", gene.var, "in feature data", sep=" "))
  }

  pos1<-which(cn == chr.var)
  if(length(pos1))
  { 
    chr<-fDat[, pos1]
  } else {
    stop(paste("no variable", chr.var, "in feature data", sep=" "))
  }


  # extract expression data
  dat<-exprs(es)

  # extract phenotype data
  pDat<-pData(es)

  res<-list(dat=dat, pDat=pDat, fDat=fDat,
    probeIDs = probeIDs, geneSymbols = geneSymbols, chr=chr)

}

getCovInterest<-function(formula, pos.var.interest, formulaType)
{
  if(pos.var.interest==0)
  {
    var.interest<-"(Intercept)"
  } else {
    fla<-as.character(formula)
    if(formulaType=="both")
    {
      # fla[1] = "~"
      # fla[2] = response variable
      # fla[3] = right-hand-side of "~" in formula
      fla3<-fla[3]
    } else {
      # fla[1] = "~"
      # fla[2] = right-hand-side of "~" in formula
      fla3<-fla[2]
    }

    tt<-strsplit(fla3, split="\\+")
    tt2<-as.character(sapply(tt, function(x) { limma::trimWhiteSpace(x)}))
    var.interest<-tt2[pos.var.interest]
  }
  return(var.interest)
}

# Check if the covariate of interest is a
# factor or interaction term with more than 2 levels.
# Return the columns in designMat containing all levels
#  of the covariate of interest.
#
checkCovariate<-function(
  formula, 
  cn.designMat, 
  pos.var.interest,
  formulaType="both")
{
  formulaType<-match.arg(formulaType, choices=c("both", "right"))

  var.interest<-getCovInterest(formula=formula, 
    pos.var.interest=pos.var.interest, formulaType=formulaType)

  if(pos.var.interest==0)
  { # the covariate of interest is the intercept
    # the intercept is the 1-st column of designMat
    colDesignMat<-1
    res<-list(var.interest=var.interest, colDesignMat=colDesignMat)
    return(res)
  } 

  cn<-cn.designMat

  # check if var.interest is an interaction term
  ttpos<-grep(pattern="\\*", x=var.interest, ignore.case =TRUE)
  ttpos2<-grep(pattern="\\:", x=var.interest, ignore.case =TRUE)
  if(length(ttpos) || length(ttpos2))
  { # var.interest is an interaction term
    if(length(ttpos))
    {
      tt3<-strsplit(var.interest, split="\\*")
    } else {
      tt3<-strsplit(var.interest, split="\\:")
    }
    # tt4 contains all covariates in the interaction term 
    tt4<-as.character(sapply(tt3, function(x) { limma::trimWhiteSpace(x)}))

    # check which columns of designMat contains all elements of tt4
    tt7<-unlist(sapply(cn, function(x) {
        tt5<-as.numeric(sapply(tt4, function(y) {
          tt6<-grep(pattern=y, x=x, fixed=TRUE)
          if(length(tt6))
          { 
            return(1) 
          } else {
            return(0) 
          }
        }))
        if(sum(tt5, na.rm=TRUE)==length(tt4))
        { # this column of designMat contains one level of var.interest  
          return(1)
        } else {
          # this column of designMat does not contain the leve of var.interest 
          return(0) 
        }
      }
    ))
    # return columns in designMat containing 
    # the all levels  of the covariate of the interest
    colDesignMat<-which(tt7==1)
    res<-list(var.interest=var.interest, colDesignMat=colDesignMat)
    return(res)
  } else {
    # not an interaction term
    # but might be a factor
    ttpos2<-grep(pattern=var.interest, x=cn, fixed=TRUE)
    ttlen<-length(ttpos2)

    if(ttlen==1)
    {
      # the covariate of interest is not a factor with
      # more than 2 levels
      # return columns in designMat containing 
      # the all levels  of the covariate of the interest
      colDesignMat<-ttpos2
      res<-list(var.interest=var.interest, colDesignMat=colDesignMat)
      return(res)
    } else {
      # each element of 'cn2' contains string var.interest 
      set<-1:length(cn)
      set2<-set[ttpos2]
      cn2<-cn[ttpos2]

      # check if cn2 contains interaction terms
      tt8<-as.numeric(unlist(sapply(cn2, function(x) {
          tt7<-grep(pattern="\\:", x=x, ignore.case=TRUE)
          if(length(tt7))
          {
            return(0) # not we want
          } else {
            return(1) # what we want
          }
        }
      )))
      # columns in designMat contains var.interest 
      ttpos3<-which(tt8 > 0)

      # return columns in designMat containing 
      # the all levels  of the covariate of the interest
      colDesignMat<-set2[ttpos3]
      res<-list(var.interest=var.interest, colDesignMat=colDesignMat)
      return(res)
    }
  }
}


getDesignMat.glm<-function(xi, pDat, formula)
{
  pDat.tmp<-pDat
  pDat.tmp$xi = xi
  designMat <- model.matrix(formula, pDat.tmp)
  invisible(designMat)
}

logit<-function(x)
{
  return(log(x)/(log(1-x)))
}


# put the variable interested in the first place of formula after ~
glmFitOneProbe<-function(
  xi, 
  pDat, 
  formula=xi~ratio, 
  logit=FALSE, 
  family=gaussian,
  verbose=FALSE)
{
  pDat.tmp<-pDat
  if(logit)
  {
    pDat.tmp$xi = log(xi/(1-xi))
  } else {
    pDat.tmp$xi = xi
  }

  # some times glm will exit abnormally 
  # even though we set na.action="na.exclude"
  # because some subjects containing missing values
  # Hence, we drop these subjects before calling glm
  pDat2<-pDat.tmp
  rownames(pDat2)<-1:nrow(pDat2)
  designMat <- model.matrix(formula, pDat2)
  nm<-colnames(designMat)
  rn<-as.numeric(rownames(designMat))
  # remove possible dropped subjects from data frame
  pDat.tmp<-pDat.tmp[rn,,drop=FALSE]

  ans<-try(summary(glm(formula=formula, family=family,
    data=pDat.tmp, na.action="na.exclude")))
  aaa<-attr(ans,which="class")
  if(length(aaa)>1)
  { aaa<-aaa[1] }
  if (aaa == "try-error")
  {
    nn<-length(nm)
    pvalVec <- rep(NA, nn)
    coefVec <- rep(NA, nn)
    statVec <- rep(NA, nn)
    cat("try-error; set pvalVec=NA and stat=NA\n")
  } else {

    if(verbose)
    { print(ans) }

    # some times, some covariates do not appear in
    # the ans$coefficients probably because
    # variances of these covariates are zero.
    # We need to pad NA for these covariates
    # to make sure the output for each gene probe 
    # has the same length
    nm2<-rownames(ans$coefficients)
    ttpos<-which(is.na(match(nm, nm2))==FALSE)
    statVec<-rep(NA, length(nm))
    statVec[ttpos]<-as.numeric(ans$coefficients[, 3])
    coefVec<-rep(NA, length(nm))
    coefVec[ttpos]<-as.numeric(ans$coefficients[, 1])
    pvalVec<-rep(NA, length(nm))
    pvalVec[ttpos]<-as.numeric(ans$coefficients[, 4])

  }

  res<-c(pvalVec, statVec, coefVec)
  names(res)<-c(paste("pval",nm, sep="."), paste("stat",nm, sep="."),
    paste("coef",nm, sep="."))
  return(res)
}



# pos.var.interest -- which covariate in the right-hand-side
# of ~ in the 'formula' is of the interest
#   pos.var.interest = 0 means intercept is of the interest
glmWrapper<-function(
  es,
  formula = FEV1~xi+age+gender, 
  pos.var.interest = 1,
  family=gaussian,
  logit=FALSE, 
  pvalAdjMethod="fdr", 
  alpha = 0.05, 
  probeID.var = "ProbeID", 
  gene.var = "Symbol", 
  chr.var = "Chromosome", 
  applier=lapply,
  verbose=TRUE 
  )
{

  tt<-inputChecking(
    pvalAdjMethod=pvalAdjMethod, 
    es=es, 
    probeID.var=probeID.var,
    gene.var=gene.var, 
    chr.var=chr.var)

  dat<-tt$dat
  pDat<-tt$pDat
  fDat<-tt$fDat
  probeIDs=tt$probeIDs
  geneSymbols=tt$geneSymbols
  chr=tt$chr

  t4<-applier(1:nrow(dat), function(i) {
      glmFitOneProbe(xi=dat[i,], pDat=pDat, formula=formula,
        logit=logit, family=family, verbose=FALSE)
    }
  )

  resMat<-t(sapply(t4, function(x) {x}))
  ttnc<-ncol(resMat)
  ttnc2<-ttnc/3

  # the first part is the p-value matrix
  pvalMat.unsorted<-resMat[,1:ttnc2, drop=FALSE]
  rownames(pvalMat.unsorted)<-featureNames(es)
  # the second part is the stat matrix
  statMat.unsorted<-resMat[,c((ttnc2+1):(2*ttnc2)), drop=FALSE]
  rownames(statMat.unsorted)<-featureNames(es)
  # the  third part is the coef matrix
  coefMat.unsorted<-resMat[,-c(1:(2*ttnc2)), drop=FALSE]
  rownames(coefMat.unsorted)<-featureNames(es)

  # check if the covariate of interest is a factor
  # or interaction term with more than 2 levels
  # first get example design matrix  
  designMat<-getDesignMat.glm(xi=dat[1,], pDat=pDat, 
    formula=formula)

  tmp<-checkCovariate(
    formula=formula, 
    cn.designMat=colnames(designMat), 
    pos.var.interest=pos.var.interest,
    formulaType="both")

  var.interest<-tmp$var.interest
  colDesignMat<-tmp$colDesignMat

  pval<-pvalMat.unsorted[, colDesignMat]
  stats<-statMat.unsorted[, colDesignMat]
  coef<-coefMat.unsorted[, colDesignMat]
  len<-length(colDesignMat)
  if(len>1)
  {
    # take the minimum p-value and its test statistic 
    pval<-apply(pval, 1, min, na.rm=TRUE)
    stats<-apply(stats, 1, function(x) { 
      absx<-abs(x)
      pos<-which(absx == max(absx, na.rm=TRUE))
      return(x[pos[1]])
    })
    coef<-apply(coef, 1, function(x) { 
      absx<-abs(x)
      pos<-which(absx == max(absx, na.rm=TRUE))
      return(x[pos[1]])
    })

    if(verbose)
    {
      cat("\n************************\n")
      cat("Warning: the covariate of interest ", var.interest, " is a factor\n")
      cat("or interaction term with more than 2 levels!\n") 
      cat("Only the smallest p-value will be output!\n")
      cat("Better to run the function lkhWrapper!\n")
      cat("************************\n")
    }
  }

  res<-outputFunc2(
    pval=pval, 
    stats=stats, 
    coef=coef, 
    pvalAdjMethod=pvalAdjMethod,
    probeIDs=probeIDs, 
    geneSymbols=geneSymbols, 
    chr=chr, 
    pvalMat.unsorted=pvalMat.unsorted,
    statMat.unsorted=statMat.unsorted, 
    coefMat.unsorted=coefMat.unsorted, 
    dat=dat, 
    alpha=alpha,
    formula = formula,
    var.interest = var.interest,
    verbose=verbose)

  res$resMat=resMat

  invisible(res)
}

# pos.var.interest -- which covariate in the right-hand-side
# of ~ in the 'formula' is of the interest
#   pos.var.interest = 0 means intercept is of the interest
# likelihood ratio test
lkhrWrapper<-function(
  es,
  formulaReduced = FEV1~xi+gender, 
  formulaFull = FEV1~xi+age+gender, 
  family=gaussian,
  logit=FALSE, 
  pvalAdjMethod="fdr", 
  alpha = 0.05, 
  probeID.var = "ProbeID", 
  gene.var = "Symbol", 
  chr.var = "Chromosome", 
  applier=lapply,
  verbose=TRUE 
  )
{

  tt<-inputChecking(
    pvalAdjMethod=pvalAdjMethod, 
    es=es, 
    probeID.var=probeID.var,
    gene.var=gene.var, 
    chr.var=chr.var)

  dat<-tt$dat
  pDat<-tt$pDat
  fDat<-tt$fDat
  probeIDs=tt$probeIDs
  geneSymbols=tt$geneSymbols
  chr=tt$chr

  t4<-applier(1:nrow(dat), function(i) {
      lkhrFitOneProbe(xi=dat[i,], pDat=pDat, 
        formulaReduced=formulaReduced,
        formulaFull=formulaFull,
        logit=logit, family=family, verbose=FALSE)
    }
  )

  resMat<-t(sapply(t4, function(x) {x}))
  resFrame<-data.frame(
    probeIDs=probeIDs,
    geneSymbols=geneSymbols, 
    chr=chr, 
    Chisq=resMat[,1], 
    Df=resMat[,2], 
    pval=resMat[,3])
  resFrame$p.adj<-p.adjust(resFrame$pval, method="fdr")
  resFrame$pos<-1:nrow(resFrame)

  resFrame.s<-resFrame[order(resFrame$Chisq, decreasing = TRUE),]

  if(verbose)
  {
    cat("\n*********************************\n")
    nTop=min(c(20, nrow(resFrame.s)))
    cat("\nTop ", nTop, " tests>>>\n")
    print(resFrame.s[1:nTop, ])
    cat("\n")

    cat("\nformulaReduced>>\n")
    print(formulaReduced)
    cat("\nformulaFull>>\n")
    print(formulaFull)
    cat("\n")

    cat("\nNumber of tests>>>", nrow(resFrame.s), "\n")
    cat("\nNumber of arrays>>>", nrow(pDat), "\n")
    cat("\nNumber of tests with pvalue<0.05>>>", 
      sum(resFrame.s$pval<0.05, na.rm=TRUE), "\n")
    cat("\nNumber of tests with FDR adjusted pvalue<0.05>>>", 
      sum(resFrame.s$p.adj<0.05, na.rm=TRUE), "\n")

  }

  res<-list(frame=resFrame.s, frame.unsorted=resFrame)
  invisible(res)
}

# put the variable interested in the first place of formula after ~
lkhrFitOneProbe<-function(
  xi, 
  pDat, 
  formulaReduced=xi~ratio+gender, 
  formulaFull=xi~ratio+age+gender, 
  logit=FALSE, 
  family=gaussian,
  verbose=FALSE)
{
  pDat.tmp<-pDat
  if(logit)
  {
    pDat.tmp$xi = log(xi/(1-xi))
  } else {
    pDat.tmp$xi = xi
  }

  # some times glm will exit abnormally 
  # even though we set na.action="na.exclude"
  # because some subjects containing missing values
  # Hence, we drop these subjects before calling glm
  pDat2<-pDat.tmp
  rownames(pDat2)<-1:nrow(pDat2)
  designMatFull <- model.matrix(formulaFull, pDat2)
  nm<-colnames(designMatFull)
  rn<-as.numeric(rownames(designMatFull))
  # remove possible dropped subjects from data frame
  pDat.tmp<-pDat.tmp[rn,,drop=FALSE]

  ansReduced<-try((glm(formula=formulaReduced, family=family,
    data=pDat.tmp, na.action="na.exclude")))
  aaaReduced<-attr(ansReduced,which="class")

  if(length(aaaReduced)>1)
  { aaaReduced<-aaaReduced[1] }

  ansFull<-try((glm(formula=formulaFull, family=family,
    data=pDat.tmp, na.action="na.exclude")))
  aaaFull<-attr(ansFull,which="class")

  if(length(aaaFull)>1)
  { aaaFull<-aaaFull[1] }


  if (aaaReduced == "try-error" || aaaFull == "try-error")
  {
    cat("try-error; set pval=NA and stat=NA\n")
    res<-rep(NA, 3)
    names(res)<-c("Chisq", "Df", "pval")

  } else {

    if(verbose)
    { 
      cat("\nReduced model>>>\n")
      print(ansReduced) 

      cat("\nFull model>>>\n")
      print(ansFull) 
    }

    res.lr<-lrtest(ansReduced, ansFull)
    res<-c(res.lr$Chisq[2], res.lr$Df[2], res.lr$"Pr(>Chisq)"[2])
    names(res)<-c("Chisq", "Df", "pval")
    if(verbose)
    {
      cat("\nlikelihood ratio test>>>>\n")
      print(res.lr)
      cat("\n")
    }

  }

  #res<-c(pvalVec, statVec)
  #names(res)<-c(paste("pval",nm, sep="."), paste("stat",nm, sep="."))
  return(res)
}

outputFunc2<-function(pval, stats, coef, pvalAdjMethod,
  probeIDs, geneSymbols, chr, pvalMat.unsorted,
  statMat.unsorted, coefMat.unsorted, dat, alpha, formula, 
  var.interest,
  verbose)
{
  # quantiles of p-values for all covariates including intercept 
  pval.quantile<-apply(pvalMat.unsorted, 2, quantile, na.rm=TRUE)
  rownames(pval.quantile)<-c("min", "25%", "median", "75%", "max")
  colnames(pval.quantile)<-colnames(pvalMat.unsorted)

  # adjust p-value
  p.adj<-p.adjust(pval, method=pvalAdjMethod)

  # re-order genes by the ascending order of p-value
  # for the covariate of our interest
  nGenes<-length(pval)
  mat<-cbind(pval, stats, coef, 1:nGenes)
  mat2<-mat[order(mat[,1]),]
  pos<-as.numeric(mat2[,4])
        
  # create output dataframe
  frame<-data.frame(probeIDs=probeIDs[pos], geneSymbols=geneSymbols[pos],
    chr=chr[pos],
    stats=mat2[,2], coef=mat2[,3], pval=mat2[,1], p.adj=p.adj[pos], pos=pos)
  rownames(frame)<-NULL

  # sorted p-value matrix
  pvalMat<-pvalMat.unsorted[pos, ,drop=FALSE]
  # sorted moderate t-test statistic matrix
  statMat<-statMat.unsorted[pos, ,drop=FALSE]
  # sorted coef matrix
  coefMat<-coefMat.unsorted[pos, ,drop=FALSE]

  frame.unsorted<-data.frame(probeIDs=probeIDs, geneSymbols=geneSymbols,
    chr=chr,
    stats=mat[,2], coef=mat[,3], pval=mat[,1], p.adj=p.adj, pos=1:nGenes)
  rownames(frame.unsorted)<-NULL
  rownames(statMat.unsorted)<-NULL
  rownames(pvalMat.unsorted)<-NULL
  rownames(coefMat.unsorted)<-NULL
  rownames(statMat)<-NULL
  rownames(coefMat)<-NULL
  rownames(pvalMat)<-NULL

  # number of significant gene probes after p-value adjustment
  n.sig<-sum(p.adj<alpha, na.rm=TRUE)
  if(verbose)
  { 
    nPrint=min(nrow(frame), 20)
    if(nPrint)
    { 
      print(frame[1:nPrint,])
    }

    cat("\npvalue quantiles for intercept and covariates>>\n")
    print(pval.quantile)
    cat("\n")

    cat("formula>>\n"); print(formula); cat("\n");
    cat("covariate of interest is ", var.interest, "\n")
    cat("Number of tests=", nrow(frame), "\n")
    cat("Number of arrays=", ncol(dat), "\n")
    cat("Number of significant tests (raw p-value < ", alpha, ")=", sum(frame$pval<0.05, na.rm=TRUE), "\n")
    cat("Number of significant tests after p-value adjustments=", n.sig, "\n")
    cat("\n\n**********************************************\n")
  }

  # the positive value of stats is over-expressed
  pos1<-which(p.adj<alpha & stats>0)
  len1<-length(pos1)

  # the negative value of stats is under-expressed
  pos3<-which(p.adj<alpha & stats<0)
  len3<-length(pos3)

  nGenes<-nrow(dat)
  memGenes<-rep(2, nGenes)
  memGenes2<-rep(0, nGenes)

  # mixing proportion
  if(len1)
  {
    pi.1<-len1/nGenes
    memGenes[pos1]<-1
  } else {
    pi.1<-0
  }
  if(len3)
  {
    pi.3<-length(pos3)/nGenes
    memGenes[pos3]<-3
  } else {
    pi.3<-0
  }
  pi.2<-1-pi.1-pi.3

  if(len1!=0 || len3!=0)
  {
    memGenes2[memGenes!=2]<-1
  } else {
    cat("No genes are differentially expressed!\n")
  }

  dat1<-dat[memGenes==1,,drop=FALSE]
  dat2<-dat[memGenes==2,,drop=FALSE]
  dat3<-dat[memGenes==3,,drop=FALSE]

  mu1<-apply(dat1, 2, mean, na.rm=TRUE)
  mu2<-apply(dat2, 2, mean, na.rm=TRUE)
  mu3<-apply(dat3, 2, mean, na.rm=TRUE)

  res<-list(n.sig=n.sig, frame=frame, 
    pvalMat=pvalMat,
    pval.quantile=pval.quantile,
    frame.unsorted=frame.unsorted,
    statMat.unsorted=statMat.unsorted,
    coefMat.unsorted=coefMat.unsorted,
    pvalMat.unsorted=pvalMat.unsorted,
    memGenes=memGenes, memGenes2=memGenes2, mu1=mu1, mu2=mu2, mu3=mu3)

  invisible(res)

}

Try the iCheck package in your browser

Any scripts or data that you put into this service are public.

iCheck documentation built on Nov. 1, 2018, 4:23 a.m.