In xpjiang-ccnu/GEEMiHC: Detecting sparse microbial association signals from longitudinal microbiome data based on generalized estimating equations

GEEMiHC <-
  function(y, id, covs = NULL, otu.tab, tree = NULL, model, Gamma = c(1,3,5,7,9), Lamda = matrix(c(1, rep(0, 8), rep(1/3, 3), rep(0, 6), rep(1/5, 5), rep(0, 4), rep(1/7, 7), rep(0, 2), rep(1/9, 9)), 5, 9, byrow = T), 
           comp = FALSE, CLR = FALSE, opt.ncl = 30, n.perm = 5000) {
    if (length(Gamma) != nrow(Lamda) | max(Gamma) != ncol(Lamda) | length(Gamma) > 100 | length(Gamma) < 1) {
      stop("The number(maximum value) of the Gamma is out of range or not equal to the number of rows(columns) in Lamda. Please enter the correct Gamma or Lamda!")
    }

    for (i in 1:nrow(Lamda)) {
      if (sum(Lamda[i,]) != 1){
        stop(paste(paste("The sum of Lamda's", i, sep = " "), paste("th row isn't 1. Please enter the correct Lamda!"), sep = ""))
      }
      if (i < nrow(Lamda) && max(Lamda[i, as.numeric((Gamma[i]+1):max(Gamma))]) != 0){
        stop("Non-zero values occupy the position of the 0 element in Lamda. Please enter the correct Lamda!")
      }
    }

    if (!comp) {
      otu.tab <- t(apply(otu.tab, 1, function(x)x/sum(x)))
    } 
    com.otu.tab <- otu.tab
    if (CLR) {
      otu.tab <- t(apply(com.otu.tab, 1, clr))
    } 
    n <- length(y)
    p <- ncol(otu.tab)
    options(warn=-1)

    if (is.null(covs)) {
      fit.GEE.AR <- MGEE(y ~ 1, id=id , family = model, corstr = "AR-1")
      f.y.GEE.AR <- fitted.values(fit.GEE.AR)
      r.GEE.AR <- y - f.y.GEE.AR

      fit.GEE.EX <- MGEE(y ~ 1, id=id , family = model, corstr = "exchangeable")
      f.y.GEE.EX <- fitted.values(fit.GEE.EX)
      r.GEE.EX <- y - f.y.GEE.EX

      fit.GEE.IN <- MGEE(y ~ 1, id=id , family = model, corstr = "independence")
      f.y.GEE.IN <- fitted.values(fit.GEE.IN)
      r.GEE.IN <- y - f.y.GEE.IN

    } else {
      fit.GEE.AR <- MGEE(y ~ ., id=id , family = model, corstr = "AR-1", as.data.frame(covs))
      f.y.GEE.AR <- fitted.values(fit.GEE.AR)
      r.GEE.AR <- y - f.y.GEE.AR

      fit.GEE.EX <- MGEE(y ~ ., id=id , family = model, corstr = "exchangeable", as.data.frame(covs))
      f.y.GEE.EX <- fitted.values(fit.GEE.EX)
      r.GEE.EX <- y - f.y.GEE.EX

      fit.GEE.IN <- MGEE(y ~ ., id=id , family = model, corstr = "independence", as.data.frame(covs))
      f.y.GEE.IN <- fitted.values(fit.GEE.IN)
      r.GEE.IN <- y - f.y.GEE.IN

    }

    r.s.GEE.AR <- list()
    r.s.GEE.EX <- list()
    r.s.GEE.IN <- list()
    for (j in 1:n.perm) {
    ind_r.s <- shuffle(n)
    r.s.GEE.AR[[j]] <- r.GEE.AR[ind_r.s]
    r.s.GEE.EX[[j]] <- r.GEE.EX[ind_r.s]
    r.s.GEE.IN[[j]] <- r.GEE.IN[ind_r.s]
    }

    n.Zs.GEE.AR <- as.numeric(r.GEE.AR) %*% otu.tab
    n.Zs.GEE.EX <- as.numeric(r.GEE.EX) %*% otu.tab
    n.Zs.GEE.IN <- as.numeric(r.GEE.IN) %*% otu.tab
    n.Z0s.GEE.AR <- lapply(r.s.GEE.AR, function(x) as.numeric(x %*% otu.tab))
    n.Z0s.GEE.EX <- lapply(r.s.GEE.EX, function(x) as.numeric(x %*% otu.tab))
    n.Z0s.GEE.IN <- lapply(r.s.GEE.IN, function(x) as.numeric(x %*% otu.tab))
    U0s.GEE.AR <- lapply(apply(sapply(r.s.GEE.AR, function(x) return(x %*% otu.tab)),1,list),unlist)    
    U0s.GEE.EX <- lapply(apply(sapply(r.s.GEE.EX, function(x) return(x %*% otu.tab)),1,list),unlist)
    U0s.GEE.IN <- lapply(apply(sapply(r.s.GEE.IN, function(x) return(x %*% otu.tab)),1,list),unlist)
    se.GEE.AR <- sapply(U0s.GEE.AR, sd)
    se.GEE.EX <- sapply(U0s.GEE.EX, sd)
    se.GEE.IN <- sapply(U0s.GEE.IN, sd)
    Zs.GEE.AR <- n.Zs.GEE.AR/se.GEE.AR
    Zs.GEE.EX <- n.Zs.GEE.EX/se.GEE.EX
    Zs.GEE.IN <- n.Zs.GEE.IN/se.GEE.IN
    Z0s.GEE.AR <- lapply(n.Z0s.GEE.AR, function(x) x/se.GEE.AR)
    Z0s.GEE.EX <- lapply(n.Z0s.GEE.EX, function(x) x/se.GEE.EX)
    Z0s.GEE.IN <- lapply(n.Z0s.GEE.IN, function(x) x/se.GEE.IN)

      if (class(tree) == "phylo") {
        CDis <- cophenetic(tree)    
        for (j in 1:nrow(CDis)) {
          ind.t <- which(CDis[j,] == 0)
          ind.d <- which(ind.t == j)
          ind.r <- ind.t[-ind.d]
          CDis[j,ind.r] <- min(CDis[j,-ind.t])/2
        }   

        if (opt.ncl == "gmx") {
          asw <- numeric(p-1)
          for (j in 2:(p-1)) {
            asw[j] <- pam(CDis, j)$silinfo$avg.width
          }
          k.best <- which.max(asw)
          clust.pam <- pam(CDis, k.best, cluster.only=TRUE)
        }
        if (opt.ncl == "fmx") {
          asw <- numeric(p-1)
          j <- 2
          asw[j] <- pam(CDis, j)$silinfo$avg.width
          while (asw[j-1] < asw[j] & j <= p-1) {
            j <- j + 1
            asw[j] <- pam(CDis, j)$silinfo$avg.width
          }     
          k.best <- j-1
          clust.pam <- pam(CDis, k.best, cluster.only=TRUE)
        }
        if (opt.ncl != "gmx" & opt.ncl != "fmx") {
          asw <- numeric(p-1)
          for (j in 2:opt.ncl) {
            asw[j] <- pam(CDis, j)$silinfo$avg.width
          }
          k.best <- which.max(asw)
          clust.pam <- pam(CDis, k.best, cluster.only=TRUE)
        }
        Ws.GEE.AR <- rep(NA, p)
        Ws.GEE.EX <- rep(NA, p)
        Ws.GEE.IN <- rep(NA, p)
        for (j in 1:p) {
          ind.1 <- match(colnames(CDis)[j], names(clust.pam))
          ind.2 <- which(clust.pam == as.numeric(clust.pam[ind.1]))
          ind.3 <- which(ind.2 == ind.1)
          ind <- ind.2[-ind.3]
          if (length(ind) == 0) {
            Ws.GEE.AR[j] <- 1
            Ws.GEE.EX[j] <- 1
            Ws.GEE.IN[j] <- 1
          } else {
            inv.Ds <- 1/(CDis[j,ind])
            abs.Zs.GEE.AR <- abs(Zs.GEE.AR[ind])
            abs.Zs.GEE.EX <- abs(Zs.GEE.EX[ind])
            abs.Zs.GEE.IN <- abs(Zs.GEE.IN[ind])
            Ws.GEE.AR[j] <- sum(inv.Ds*abs.Zs.GEE.AR)/sum(inv.Ds) + 1
            Ws.GEE.EX[j] <- sum(inv.Ds*abs.Zs.GEE.EX)/sum(inv.Ds) + 1
            Ws.GEE.IN[j] <- sum(inv.Ds*abs.Zs.GEE.IN)/sum(inv.Ds) + 1
          }
        }
        Ws.GEE.AR <- Ws.GEE.AR/sum(Ws.GEE.AR)
        Ws.GEE.EX <- Ws.GEE.EX/sum(Ws.GEE.EX)
        Ws.GEE.IN <- Ws.GEE.IN/sum(Ws.GEE.IN)

        ahc.o.GEE.AR <- as.list(GEEMiHC.stat(Zs=Zs.GEE.AR, Gamma=Gamma, Lamda=Lamda, Ws=Ws.GEE.AR))
        ahc.o.GEE.EX <- as.list(GEEMiHC.stat(Zs=Zs.GEE.EX, Gamma=Gamma, Lamda=Lamda, Ws=Ws.GEE.EX))
        ahc.o.GEE.IN <- as.list(GEEMiHC.stat(Zs=Zs.GEE.IN, Gamma=Gamma, Lamda=Lamda, Ws=Ws.GEE.IN))
        ahc.p.GEE.AR <- lapply(apply(sapply(Z0s.GEE.AR, function(x) GEEMiHC.stat(Zs=x, Gamma=Gamma, Lamda=Lamda, Ws=Ws.GEE.AR)), 1, list), unlist)
        ahc.p.GEE.EX <- lapply(apply(sapply(Z0s.GEE.EX, function(x) GEEMiHC.stat(Zs=x, Gamma=Gamma, Lamda=Lamda, Ws=Ws.GEE.EX)), 1, list), unlist)
        ahc.p.GEE.IN <- lapply(apply(sapply(Z0s.GEE.IN, function(x) GEEMiHC.stat(Zs=x, Gamma=Gamma, Lamda=Lamda, Ws=Ws.GEE.IN)), 1, list), unlist)
        pvs.GEE.AR <- mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), ahc.o.GEE.AR, ahc.p.GEE.AR)
        pvs.GEE.EX <- mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), ahc.o.GEE.EX, ahc.p.GEE.EX)
        pvs.GEE.IN <- mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), ahc.o.GEE.IN, ahc.p.GEE.IN)

        ind.pvs.GEE.AR <- 1-pchisq(Zs.GEE.AR^2, df=1)
        ind.pvs.GEE.EX <- 1-pchisq(Zs.GEE.EX^2, df=1)
        ind.pvs.GEE.IN <- 1-pchisq(Zs.GEE.IN^2, df=1)
        simes.pv.GEE.AR <- round(min(length(ind.pvs.GEE.AR)*ind.pvs.GEE.AR/rank(ind.pvs.GEE.AR)),4)
        simes.pv.GEE.EX <- round(min(length(ind.pvs.GEE.EX)*ind.pvs.GEE.EX/rank(ind.pvs.GEE.EX)),4)
        simes.pv.GEE.IN <- round(min(length(ind.pvs.GEE.IN)*ind.pvs.GEE.IN/rank(ind.pvs.GEE.IN)),4)
        simes.pv0s.ori.GEE.AR <- unlist(lapply(Z0s.GEE.AR, function(x) {xx <- 1-pchisq(x^2, df=1); min(length(xx)*xx/rank(xx))}))
        simes.pv0s.ori.GEE.EX <- unlist(lapply(Z0s.GEE.EX, function(x) {xx <- 1-pchisq(x^2, df=1); min(length(xx)*xx/rank(xx))}))
        simes.pv0s.ori.GEE.IN <- unlist(lapply(Z0s.GEE.IN, function(x) {xx <- 1-pchisq(x^2, df=1); min(length(xx)*xx/rank(xx))}))
        simes.pv0s.GEE.AR <- rep(NA, n.perm)
        simes.pv0s.GEE.EX <- rep(NA, n.perm)
        simes.pv0s.GEE.IN <- rep(NA, n.perm)

        for (j in 1:n.perm) {   
          simes.pv0s.GEE.AR[j] <- (length(which(simes.pv0s.ori.GEE.AR[-j] < simes.pv0s.ori.GEE.AR[j]))+0.01)/(n.perm+0.01)
          simes.pv0s.GEE.EX[j] <- (length(which(simes.pv0s.ori.GEE.EX[-j] < simes.pv0s.ori.GEE.EX[j]))+0.01)/(n.perm+0.01)
          simes.pv0s.GEE.IN[j] <- (length(which(simes.pv0s.ori.GEE.IN[-j] < simes.pv0s.ori.GEE.IN[j]))+0.01)/(n.perm+0.01)
        }

        simes.pv.GEE.AR <- (length(which(simes.pv0s.GEE.AR < simes.pv.GEE.AR))+0.01)/(n.perm+0.01)
        simes.pv.GEE.EX <- (length(which(simes.pv0s.GEE.EX < simes.pv.GEE.EX))+0.01)/(n.perm+0.01)
        simes.pv.GEE.IN <- (length(which(simes.pv0s.GEE.IN < simes.pv.GEE.IN))+0.01)/(n.perm+0.01)

        l.Gamma <- length(Gamma)
        Tu.GEE.AR <- min(pvs.GEE.AR[1:l.Gamma], simes.pv.GEE.AR)
        Tu.GEE.EX <- min(pvs.GEE.EX[1:l.Gamma], simes.pv.GEE.EX)
        Tu.GEE.IN <- min(pvs.GEE.IN[1:l.Gamma], simes.pv.GEE.IN)
        T0u.GEE.AR <- rep(NA, n.perm)
        T0u.GEE.EX <- rep(NA, n.perm)
        T0u.GEE.IN <- rep(NA, n.perm)
        Tw.GEE.AR <- min(pvs.GEE.AR[(l.Gamma+1):(l.Gamma*2)], simes.pv.GEE.AR)
        Tw.GEE.EX <- min(pvs.GEE.EX[(l.Gamma+1):(l.Gamma*2)], simes.pv.GEE.EX)
        Tw.GEE.IN <- min(pvs.GEE.IN[(l.Gamma+1):(l.Gamma*2)], simes.pv.GEE.IN)
        T0w.GEE.AR <- rep(NA, n.perm)
        T0w.GEE.EX <- rep(NA, n.perm)
        T0w.GEE.IN <- rep(NA, n.perm)
        for (j in 1:n.perm) {
          T0u.o.GEE.AR <- lapply(ahc.p.GEE.AR[1:l.Gamma], function(x) x[j])
          T0u.o.GEE.EX <- lapply(ahc.p.GEE.EX[1:l.Gamma], function(x) x[j])
          T0u.o.GEE.IN <- lapply(ahc.p.GEE.IN[1:l.Gamma], function(x) x[j])
          T0u.p.GEE.AR <- lapply(ahc.p.GEE.AR[1:l.Gamma], function(x) x[-j])
          T0u.p.GEE.EX <- lapply(ahc.p.GEE.EX[1:l.Gamma], function(x) x[-j])
          T0u.p.GEE.IN <- lapply(ahc.p.GEE.IN[1:l.Gamma], function(x) x[-j])
          T0u.GEE.AR[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0u.o.GEE.AR, T0u.p.GEE.AR))
          T0u.GEE.EX[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0u.o.GEE.EX, T0u.p.GEE.EX))
          T0u.GEE.IN[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0u.o.GEE.IN, T0u.p.GEE.IN))
        }
        for (j in 1:n.perm) {
          T0w.o.GEE.AR <- lapply(ahc.p.GEE.AR[(l.Gamma+1):(l.Gamma*2)], function(x) x[j])
          T0w.o.GEE.EX <- lapply(ahc.p.GEE.EX[(l.Gamma+1):(l.Gamma*2)], function(x) x[j])
          T0w.o.GEE.IN <- lapply(ahc.p.GEE.IN[(l.Gamma+1):(l.Gamma*2)], function(x) x[j])
          T0w.p.GEE.AR <- lapply(ahc.p.GEE.AR[(l.Gamma+1):(l.Gamma*2)], function(x) x[-j])
          T0w.p.GEE.EX <- lapply(ahc.p.GEE.EX[(l.Gamma+1):(l.Gamma*2)], function(x) x[-j])
          T0w.p.GEE.IN <- lapply(ahc.p.GEE.IN[(l.Gamma+1):(l.Gamma*2)], function(x) x[-j])
          T0w.GEE.AR[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0w.o.GEE.AR, T0w.p.GEE.AR))
          T0w.GEE.EX[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0w.o.GEE.EX, T0w.p.GEE.EX))
          T0w.GEE.IN[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0w.o.GEE.IN, T0w.p.GEE.IN))
        }

        T.GEE.AR <- min(pvs.GEE.AR, simes.pv.GEE.AR)
        T0.GEE.AR <- apply(cbind(T0u.GEE.AR, T0w.GEE.AR, simes.pv0s.GEE.AR),1,min)
        pv.opt.GEE.AR <- (length(which(T0.GEE.AR < T.GEE.AR))+0.01)/(n.perm+0.01)

        T.GEE.EX <- min(pvs.GEE.EX, simes.pv.GEE.EX)
        T0.GEE.EX <- apply(cbind(T0u.GEE.EX, T0w.GEE.EX, simes.pv0s.GEE.EX),1,min)
        pv.opt.GEE.EX <- (length(which(T0.GEE.EX < T.GEE.EX))+0.01)/(n.perm+0.01)

        T.GEE.IN <- min(pvs.GEE.IN, simes.pv.GEE.IN)
        T0.GEE.IN <- apply(cbind(T0u.GEE.IN, T0w.GEE.IN, simes.pv0s.GEE.IN),1,min)
        pv.opt.GEE.IN <- (length(which(T0.GEE.IN < T.GEE.IN))+0.01)/(n.perm+0.01)

        T <- min(T.GEE.AR, T.GEE.EX, T.GEE.IN)
        T0 <- apply(cbind(T0.GEE.AR, T0.GEE.EX, T0.GEE.IN),1,min)
        pv.opt <- (length(which(T0 < T))+0.01)/(n.perm+0.01)

        names(simes.pv.GEE.AR) <- "Simes.GEE(AR)"
        names(simes.pv.GEE.EX) <- "Simes.GEE(EX)"
        names(simes.pv.GEE.IN) <- "Simes.GEE(IN)"
        item.pvs.GEE.AR <- cbind(pvs.GEE.AR[1:l.Gamma], pvs.GEE.AR[(l.Gamma+1):(l.Gamma*2)])
        item.pvs.GEE.EX <- cbind(pvs.GEE.EX[1:l.Gamma], pvs.GEE.EX[(l.Gamma+1):(l.Gamma*2)])
        item.pvs.GEE.IN <- cbind(pvs.GEE.IN[1:l.Gamma], pvs.GEE.IN[(l.Gamma+1):(l.Gamma*2)])
        colnames(item.pvs.GEE.AR) <- c("uGEEHC(h).AR", "wGEEHC(h).AR")
        colnames(item.pvs.GEE.EX) <- c("uGEEHC(h).EX", "wGEEHC(h).EX")
        colnames(item.pvs.GEE.IN) <- c("uGEEHC(h).IN", "wGEEHC(h).IN")
        rownames(item.pvs.GEE.AR) <- as.character(Gamma)
        rownames(item.pvs.GEE.EX) <- as.character(Gamma)
        rownames(item.pvs.GEE.IN) <- as.character(Gamma)

        ada.pvs <- c(pv.opt.GEE.AR, pv.opt.GEE.EX, pv.opt.GEE.IN, pv.opt)
        names(ada.pvs) <- c("GEEMiHC(AR)", "GEEMiHC(EX)", "GEEMiHC(IN)", "aGEEMiHC")

        otu.ids <- colnames(otu.tab)
        otu.ids <- sub("New.ReferenceOTU", "N", otu.ids)
        otu.ids <- sub("New.CleanUp.ReferenceOTU", "NC", otu.ids)
        pvs.GEE.AR <- ind.pvs.GEE.AR
        pvs.GEE.EX <- ind.pvs.GEE.EX
        pvs.GEE.IN <- ind.pvs.GEE.IN
        i0.GEE.AR <- which(pvs.GEE.AR < 0.00000001)
        i0.GEE.EX <- which(pvs.GEE.EX < 0.00000001)
        i0.GEE.IN <- which(pvs.GEE.IN < 0.00000001)
        i1.GEE.AR <- which(pvs.GEE.AR > 0.99999999)
        i1.GEE.EX <- which(pvs.GEE.EX > 0.99999999)
        i1.GEE.IN <- which(pvs.GEE.IN > 0.99999999)
        pvs.GEE.AR[i0.GEE.AR] <- 0.00000001
        pvs.GEE.EX[i0.GEE.EX] <- 0.00000001
        pvs.GEE.IN[i0.GEE.IN] <- 0.00000001
        pvs.GEE.AR[i1.GEE.AR] <- 0.99999999
        pvs.GEE.EX[i1.GEE.EX] <- 0.99999999
        pvs.GEE.IN[i1.GEE.IN] <- 0.99999999
        Is.GEE.AR <- order(order(pvs.GEE.AR))
        Is.GEE.EX <- order(order(pvs.GEE.EX))
        Is.GEE.IN <- order(order(pvs.GEE.IN))
        exp.HC.GEE.AR <- (Is.GEE.AR/p)/sqrt(pvs.GEE.AR*(1-pvs.GEE.AR)/p)
        exp.HC.GEE.EX <- (Is.GEE.EX/p)/sqrt(pvs.GEE.EX*(1-pvs.GEE.EX)/p)
        exp.HC.GEE.IN <- (Is.GEE.IN/p)/sqrt(pvs.GEE.IN*(1-pvs.GEE.IN)/p)
        obs.HC.GEE.AR <- pvs.GEE.AR/sqrt(pvs.GEE.AR*(1-pvs.GEE.AR)/p)
        obs.HC.GEE.EX <- pvs.GEE.EX/sqrt(pvs.GEE.EX*(1-pvs.GEE.EX)/p)
        obs.HC.GEE.IN <- pvs.GEE.IN/sqrt(pvs.GEE.IN*(1-pvs.GEE.IN)/p)
        exp.wHC.GEE.AR <- Ws.GEE.AR*(Is.GEE.AR/p)/sqrt(pvs.GEE.AR*(1-pvs.GEE.AR)/p)
        exp.wHC.GEE.EX <- Ws.GEE.EX*(Is.GEE.EX/p)/sqrt(pvs.GEE.EX*(1-pvs.GEE.EX)/p)
        exp.wHC.GEE.IN <- Ws.GEE.IN*(Is.GEE.IN/p)/sqrt(pvs.GEE.IN*(1-pvs.GEE.IN)/p)
        obs.wHC.GEE.AR <- Ws.GEE.AR*pvs.GEE.AR/sqrt(pvs.GEE.AR*(1-pvs.GEE.AR)/p)
        obs.wHC.GEE.EX <- Ws.GEE.EX*pvs.GEE.EX/sqrt(pvs.GEE.EX*(1-pvs.GEE.EX)/p)
        obs.wHC.GEE.IN <- Ws.GEE.IN*pvs.GEE.IN/sqrt(pvs.GEE.IN*(1-pvs.GEE.IN)/p)
        deviation.uGEEHC.AR <- abs(exp.HC.GEE.AR - obs.HC.GEE.AR)
        deviation.uGEEHC.EX <- abs(exp.HC.GEE.EX - obs.HC.GEE.EX)
        deviation.uGEEHC.IN <- abs(exp.HC.GEE.IN - obs.HC.GEE.IN)
        deviation.wGEEHC.AR <- abs(exp.wHC.GEE.AR - obs.wHC.GEE.AR)
        deviation.wGEEHC.EX <- abs(exp.wHC.GEE.EX - obs.wHC.GEE.EX)
        deviation.wGEEHC.IN <- abs(exp.wHC.GEE.IN - obs.wHC.GEE.IN)
        rank.uGEEHC.AR <- otu.ids[order(deviation.uGEEHC.AR, decreasing = T)]
        rank.uGEEHC.EX <- otu.ids[order(deviation.uGEEHC.EX, decreasing = T)]
        rank.uGEEHC.IN <- otu.ids[order(deviation.uGEEHC.IN, decreasing = T)]
        rank.wGEEHC.AR <- otu.ids[order(deviation.wGEEHC.AR, decreasing = T)]
        rank.wGEEHC.EX <- otu.ids[order(deviation.wGEEHC.EX, decreasing = T)]
        rank.wGEEHC.IN <- otu.ids[order(deviation.wGEEHC.IN, decreasing = T)]
        return(list(graph.els = list(otu.ids = otu.ids, exp.GEEHC.AR = exp.HC.GEE.AR, exp.GEEHC.EX = exp.HC.GEE.EX, exp.GEEHC.IN = exp.HC.GEE.IN, obs.GEEHC.AR = obs.HC.GEE.AR, obs.GEEHC.EX = obs.HC.GEE.EX, obs.GEEHC.IN = obs.HC.GEE.IN, exp.wGEEHC.AR = exp.wHC.GEE.AR, exp.wGEEHC.EX = exp.wHC.GEE.EX, exp.wGEEHC.IN = exp.wHC.GEE.IN, obs.wGEEHC.AR = obs.wHC.GEE.AR, obs.wGEEHC.EX = obs.wHC.GEE.EX, obs.wGEEHC.IN = obs.wHC.GEE.IN), opt.nclust = k.best, rank.order = list(uGEEHC.AR = rank.uGEEHC.AR, uGEEHC.EX = rank.uGEEHC.EX, uGEEHC.IN = rank.uGEEHC.IN, wGEEHC.AR = rank.wGEEHC.AR, wGEEHC.EX = rank.wGEEHC.EX, wGEEHC.IN = rank.wGEEHC.IN), simes.pv.GEE.AR = simes.pv.GEE.AR, simes.pv.GEE.EX = simes.pv.GEE.EX, simes.pv.GEE.IN = simes.pv.GEE.IN, ind.pvs.GEEMiHC.AR = item.pvs.GEE.AR, ind.pvs.GEEMiHC.EX = item.pvs.GEE.EX, ind.pvs.GEEMiHC.IN = item.pvs.GEE.IN, ada.pvs=ada.pvs))
      }

      if (class(tree) != "phylo") {
        ahc.o.GEE.AR <- as.list(GEEMiHC.stat(Zs=Zs.GEE.AR, Gamma=Gamma, Lamda=Lamda, Ws=NULL))
        ahc.o.GEE.EX <- as.list(GEEMiHC.stat(Zs=Zs.GEE.EX, Gamma=Gamma, Lamda=Lamda, Ws=NULL))
        ahc.o.GEE.IN <- as.list(GEEMiHC.stat(Zs=Zs.GEE.IN, Gamma=Gamma, Lamda=Lamda, Ws=NULL))
        ahc.p.GEE.AR <- lapply(apply(sapply(Z0s.GEE.AR, function(x) GEEMiHC.stat(Zs=x, Gamma=Gamma, Lamda=Lamda, Ws=NULL)), 1, list), unlist)
        ahc.p.GEE.EX <- lapply(apply(sapply(Z0s.GEE.EX, function(x) GEEMiHC.stat(Zs=x, Gamma=Gamma, Lamda=Lamda, Ws=NULL)), 1, list), unlist)
        ahc.p.GEE.IN <- lapply(apply(sapply(Z0s.GEE.IN, function(x) GEEMiHC.stat(Zs=x, Gamma=Gamma, Lamda=Lamda, Ws=NULL)), 1, list), unlist)
        pvs.GEE.AR <- mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), ahc.o.GEE.AR, ahc.p.GEE.AR)
        pvs.GEE.EX <- mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), ahc.o.GEE.EX, ahc.p.GEE.EX)
        pvs.GEE.IN <- mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), ahc.o.GEE.IN, ahc.p.GEE.IN)

        ind.pvs.GEE.AR <- 1-pchisq(Zs.GEE.AR^2, df=1)
        ind.pvs.GEE.EX <- 1-pchisq(Zs.GEE.EX^2, df=1)
        ind.pvs.GEE.IN <- 1-pchisq(Zs.GEE.IN^2, df=1)
        simes.pv.GEE.AR <- min(length(ind.pvs.GEE.AR)*ind.pvs.GEE.AR/rank(ind.pvs.GEE.AR))
        simes.pv.GEE.EX <- min(length(ind.pvs.GEE.EX)*ind.pvs.GEE.EX/rank(ind.pvs.GEE.EX))
        simes.pv.GEE.IN <- min(length(ind.pvs.GEE.IN)*ind.pvs.GEE.IN/rank(ind.pvs.GEE.IN))
        simes.pv0s.ori.GEE.AR <- unlist(lapply(Z0s.GEE.AR, function(x) {xx <- 1-pchisq(x^2, df=1); min(length(xx)*xx/rank(xx))}))
        simes.pv0s.ori.GEE.EX <- unlist(lapply(Z0s.GEE.EX, function(x) {xx <- 1-pchisq(x^2, df=1); min(length(xx)*xx/rank(xx))}))
        simes.pv0s.ori.GEE.IN <- unlist(lapply(Z0s.GEE.IN, function(x) {xx <- 1-pchisq(x^2, df=1); min(length(xx)*xx/rank(xx))}))
        simes.pv0s.GEE.AR <- rep(NA, n.perm)
        simes.pv0s.GEE.EX <- rep(NA, n.perm)
        simes.pv0s.GEE.IN <- rep(NA, n.perm)

        for (j in 1:n.perm) {   
          simes.pv0s.GEE.AR[j] <- (length(which(simes.pv0s.ori.GEE.AR[-j] < simes.pv0s.ori.GEE.AR[j]))+0.01)/(n.perm+0.01)
          simes.pv0s.GEE.EX[j] <- (length(which(simes.pv0s.ori.GEE.EX[-j] < simes.pv0s.ori.GEE.EX[j]))+0.01)/(n.perm+0.01)
          simes.pv0s.GEE.IN[j] <- (length(which(simes.pv0s.ori.GEE.IN[-j] < simes.pv0s.ori.GEE.IN[j]))+0.01)/(n.perm+0.01)
        }

        simes.pv.GEE.AR <- (length(which(simes.pv0s.GEE.AR < simes.pv.GEE.AR))+0.01)/(n.perm+0.01)
        simes.pv.GEE.EX <- (length(which(simes.pv0s.GEE.EX < simes.pv.GEE.EX))+0.01)/(n.perm+0.01)
        simes.pv.GEE.IN <- (length(which(simes.pv0s.GEE.IN < simes.pv.GEE.IN))+0.01)/(n.perm+0.01)

        l.Gamma <- length(Gamma)
        Tu.GEE.AR <- min(pvs.GEE.AR, simes.pv.GEE.AR)
        Tu.GEE.EX <- min(pvs.GEE.EX, simes.pv.GEE.EX)
        Tu.GEE.IN <- min(pvs.GEE.IN, simes.pv.GEE.IN)
        T0u.GEE.AR <- rep(NA, n.perm)
        T0u.GEE.EX <- rep(NA, n.perm)
        T0u.GEE.IN <- rep(NA, n.perm)
        for (j in 1:n.perm) {
          T0u.o.GEE.AR <- lapply(ahc.p.GEE.AR, function(x) x[j])
          T0u.o.GEE.EX <- lapply(ahc.p.GEE.EX, function(x) x[j])
          T0u.o.GEE.IN <- lapply(ahc.p.GEE.IN, function(x) x[j])
          T0u.p.GEE.AR <- lapply(ahc.p.GEE.AR, function(x) x[-j])
          T0u.p.GEE.EX <- lapply(ahc.p.GEE.EX, function(x) x[-j])
          T0u.p.GEE.IN <- lapply(ahc.p.GEE.IN, function(x) x[-j])
          T0u.GEE.AR[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0u.o.GEE.AR, T0u.p.GEE.AR))
          T0u.GEE.EX[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0u.o.GEE.EX, T0u.p.GEE.EX))
          T0u.GEE.IN[j] <- min(mapply(function(x, y) (length(which(x < y))+0.01)/(n.perm+0.01), T0u.o.GEE.IN, T0u.p.GEE.IN))
        }
        T0u.minp.GEE.AR <- apply(cbind(T0u.GEE.AR, simes.pv0s.GEE.AR),1,min)
        T0u.minp.GEE.EX <- apply(cbind(T0u.GEE.EX, simes.pv0s.GEE.EX),1,min)
        T0u.minp.GEE.IN <- apply(cbind(T0u.GEE.IN, simes.pv0s.GEE.IN),1,min)
        pv.opt.u.GEE.AR <- (length(which(T0u.minp.GEE.AR < Tu.GEE.AR))+0.01)/(n.perm+0.01)
        pv.opt.u.GEE.EX <- (length(which(T0u.minp.GEE.EX < Tu.GEE.EX))+0.01)/(n.perm+0.01)
        pv.opt.u.GEE.IN <- (length(which(T0u.minp.GEE.IN < Tu.GEE.IN))+0.01)/(n.perm+0.01)

        T <- min(Tu.GEE.AR, Tu.GEE.EX, Tu.GEE.IN)
        T0 <- apply(cbind(T0u.minp.GEE.AR, T0u.minp.GEE.EX, T0u.minp.GEE.IN),1,min)
        pv.opt <- (length(which(T0 < T))+0.01)/(n.perm+0.01)

        names(simes.pv.GEE.AR) <- "Simes.GEE(AR)"
        names(simes.pv.GEE.EX) <- "Simes.GEE(EX)"
        names(simes.pv.GEE.IN) <- "Simes.GEE(IN)"
        item.pvs.GEE.AR <- as.matrix(pvs.GEE.AR)
        item.pvs.GEE.EX <- as.matrix(pvs.GEE.EX)
        item.pvs.GEE.IN <- as.matrix(pvs.GEE.IN)
        colnames(item.pvs.GEE.AR) <- "uGEEHC(h).AR"
        colnames(item.pvs.GEE.EX) <- "uGEEHC(h).EX"
        colnames(item.pvs.GEE.IN) <- "uGEEHC(h).IN"
        rownames(item.pvs.GEE.AR) <- as.character(Gamma)
        rownames(item.pvs.GEE.EX) <- as.character(Gamma)
        rownames(item.pvs.GEE.IN) <- as.character(Gamma)

        ada.pvs <- c(pv.opt.u.GEE.AR, pv.opt.u.GEE.EX, pv.opt.u.GEE.IN, pv.opt)
        names(ada.pvs) <- c("GEEMiHC(AR)", "GEEMiHC(EX)", "GEEMiHC(IN)", "aGEEMiHC")

        otu.ids <- colnames(otu.tab)
        otu.ids <- sub("New.ReferenceOTU", "N", otu.ids)
        otu.ids <- sub("New.CleanUp.ReferenceOTU", "NC", otu.ids)
        pvs.GEE.AR <- ind.pvs.GEE.AR
        pvs.GEE.EX <- ind.pvs.GEE.EX
        pvs.GEE.IN <- ind.pvs.GEE.IN
        i0.GEE.AR <- which(pvs.GEE.AR < 0.00000001)
        i0.GEE.EX <- which(pvs.GEE.EX < 0.00000001)
        i0.GEE.IN <- which(pvs.GEE.IN < 0.00000001)
        i1.GEE.AR <- which(pvs.GEE.AR > 0.99999999)
        i1.GEE.EX <- which(pvs.GEE.EX > 0.99999999)
        i1.GEE.IN <- which(pvs.GEE.IN > 0.99999999)
        pvs.GEE.AR[i0.GEE.AR] <- 0.00000001
        pvs.GEE.EX[i0.GEE.EX] <- 0.00000001
        pvs.GEE.IN[i0.GEE.IN] <- 0.00000001
        pvs.GEE.AR[i1.GEE.AR] <- 0.99999999
        pvs.GEE.EX[i1.GEE.EX] <- 0.99999999
        pvs.GEE.IN[i1.GEE.IN] <- 0.99999999
        Is.GEE.AR <- order(order(pvs.GEE.AR))
        Is.GEE.EX <- order(order(pvs.GEE.EX))
        Is.GEE.IN <- order(order(pvs.GEE.IN))
        exp.HC.GEE.AR <- (Is.GEE.AR/p)/sqrt(pvs.GEE.AR*(1-pvs.GEE.AR)/p)
        exp.HC.GEE.EX <- (Is.GEE.EX/p)/sqrt(pvs.GEE.EX*(1-pvs.GEE.EX)/p)
        exp.HC.GEE.IN <- (Is.GEE.IN/p)/sqrt(pvs.GEE.IN*(1-pvs.GEE.IN)/p)
        obs.HC.GEE.AR <- pvs.GEE.AR/sqrt(pvs.GEE.AR*(1-pvs.GEE.AR)/p)
        obs.HC.GEE.EX <- pvs.GEE.EX/sqrt(pvs.GEE.EX*(1-pvs.GEE.EX)/p)
        obs.HC.GEE.IN <- pvs.GEE.IN/sqrt(pvs.GEE.IN*(1-pvs.GEE.IN)/p)
        deviation.uGEEHC.AR <- abs(exp.HC.GEE.AR - obs.HC.GEE.AR)
        deviation.uGEEHC.EX <- abs(exp.HC.GEE.EX - obs.HC.GEE.EX)
        deviation.uGEEHC.IN <- abs(exp.HC.GEE.IN - obs.HC.GEE.IN)
        rank.uGEEHC.AR <- otu.ids[order(deviation.uGEEHC.AR, decreasing = T)]
        rank.uGEEHC.EX <- otu.ids[order(deviation.uGEEHC.EX, decreasing = T)]
        rank.uGEEHC.IN <- otu.ids[order(deviation.uGEEHC.IN, decreasing = T)]
        return(list(graph.els = list(otu.ids = otu.ids, exp.GEEHC.AR = exp.HC.GEE.AR, exp.GEEHC.EX = exp.HC.GEE.EX, exp.GEEHC.IN = exp.HC.GEE.IN, obs.GEEHC.AR = obs.HC.GEE.AR, obs.GEEHC.EX = obs.HC.GEE.EX, obs.GEEHC.IN = obs.HC.GEE.IN), rank.order = list(uGEEHC.AR = rank.uGEEHC.AR, uGEEHC.EX = rank.uGEEHC.EX, uGEEHC.IN = rank.uGEEHC.IN), simes.pv.GEE.AR = simes.pv.GEE.AR, simes.pv.GEE.EX = simes.pv.GEE.EX, simes.pv.GEE.IN = simes.pv.GEE.IN, ind.pvs.GEEMiHC.AR = item.pvs.GEE.AR, ind.pvs.GEEMiHC.EX = item.pvs.GEE.EX, ind.pvs.GEEMiHC.IN = item.pvs.GEE.IN, ada.pvs = ada.pvs))
      }

  }

1. Introduction

This R package, GEEMiHC, can be used for detecting sparse microbial association signals adaptively from longitudinal microbiome data. It can be applied to datasets with diverse types of outcomes to study the association between diverse types of host phenotype and microbiome, such BMI (Gaussian distribution), disease status (Binomial distribution) or number of tumors (Poisson distribution). Considering cross-sectional data as a special case of longitudinal data, it can be also applied to cross-sectional data, in which case the results will be consistent with MiHC.

2. Installation

2.1. Requisite R packages

Our R package GEEMiHC requires these existing R packages ("phyloseq", "cluster", "compositions", "permute", "PGEE", "vegan", "ape", "dirmult", "aSPU", "MiSPU") to be pre-imported, where "phyloseq" is available in Bioconductor and the others are available in CRAN. We need to install these package using the following code:

BiocManager::install("phyloseq")
install.packages("cluster")
install.packages("compositions")
install.packages("permute")
install.packages("PGEE")
install.packages("vegan")
install.packages("ape")
install.packages("dirmult")
install.packages("aSPU")
install.packages("MiSPU")

2.2. R package GEEMiHC

You may install GEEMiHC from GitHub using the following code:

install.packages("devtools")
devtools::install_github("xpjiang-ccnu/GEEMiHC", force=T)

3. Model

3.1. Generalized estimating equations

We establish the connection between the marginal mean and other factors by specifying that

\begin{equation} \begin{aligned}[b] g({\mu}{it}) = \textbf{X}{it}^T \alpha + {\textbf{o}}{it}^T \beta, \end{aligned} \end{equation} where $\textbf{o}{it} = (o_{it1}, o_{it2}, \ldots, o_{itm})^T$ represents the abundance data of $m$ OTUs, $\textbf{X}{it} = (1, x{it1}, \ldots, x_{itl})^T$ is $(l+1)$-dimension vectors of covariates and $\mu {it}$ represents the expectation of $y{it}$. And we can obtain the estimator of regression parameter $\theta = (\alpha^T,\beta^T)^T$ by solving the following:

\begin{equation} \begin{aligned}[b] \textbf{S}(\theta) & = \sum_{i=1}^n (\frac{\partial \mu_i(\theta)}{\partial \theta ^T})^T \textbf{V}^{-1}i (\textbf{Y}_i-\mu_i(\theta)) \ & = \sum{i=1}^n {\begin{pmatrix} \textbf{X}{i}^T \ \textbf{o}{i}^T \end{pmatrix}} \textbf{A}_i(\theta)\textbf{V}^{-1}_i(\textbf{Y}_i-\mu_i(\theta))=0. \end{aligned} \end{equation} where $\textbf{V}_i=\textbf{A}^{1/2}_i(\theta)\textbf{R}(\tau)\textbf{A}^{1/2}_i(\theta)$ is the working covariance matrix and $\textbf{R}(\tau)$ is the working correlation matrix.

3.2. Marginal score statistics

The GEE-based marginal score statistic is as follows: \begin{equation}\label{Z} \begin{aligned}[b] Z^{\rm GEE}j = \frac{\textbf{o}^T{{.}j}(\textbf{Y}-\hat{\mu}0)}{\sqrt{\textbf{o}^T{{.}j} \textbf{P} \textbf{o}{{.}j}}}, \end{aligned} \end{equation} where $\textbf{o}{.j} = (o_{11j}, o_{12j}, \ldots, o_{n {k_n} j})^T$ and $\textbf{Y} = (\textbf{Y}{1}^T, \ldots, \textbf{Y}_n^T)^T$. The vector of the expected values $\hat{\mu}_0$ consists of the estimator of ${\mu}{it}$

3.3. GEE-based higher criticism analyses

We consider association signals with close phylogenetic relevance to be amplified by adding the weighted factor $w_j$ to the GEE-based HC test (i.e., GEEHC test). Following the HC test and MiHC, the test statistics for the original and weighted GEE-based HC test are as follows: \begin{equation}\label{uGEEHC} \begin{aligned}[b] {\rm uGEEHC}=\max {j \in {{1, 2, \ldots, m}}} {\frac{r_j / m - p_j}{\sqrt{p_j (1-p_j)/m}}}, \end{aligned} \end{equation} \begin{equation}\label{wGEEHC} \begin{aligned}[b] {\rm wGEEHC}=\max {j \in {{1, 2, \ldots, m}}} {\frac{w_j (r_j / m - p_j)}{\sqrt{p_j (1-p_j)/m}}}, \end{aligned} \end{equation} where $$ w_j = \frac{\sum_ {j' \in \zeta (j)/{{j}}}^{} \frac{1}{D_{j, j'}} |Z^{\rm GEE}{j'}|}{\sum {j' \in \zeta (j)/{{j}}}^{} \frac{1}{D_{j, j'}}} +1 $$ is the weight of the $j$th OTU and $D_{j, j'}$ denotes the cophenetic distance and $\zeta (j)$ is a cluster with $j$th OTU by using the partitioning-around-medoids algorithm (PAM).

3.4. Adjustments for low sparsity levels

To adapt to low sparsity levels, we can make appropriate adjustments: \begin{equation}\label{uGEEHC(lambda)} \begin{aligned}[b] {\rm uGEEHC}{(\lambda)} = \sum {j'=1}^\lambda {\lambda_{j'} * {\rm uGEEHC}{j'}}, \end{aligned} \end{equation} \begin{equation}\label{wGEEHC(lambda)} \begin{aligned}[b] {\rm wGEEHC}{(\lambda)} = \sum {j'=1}^\lambda {\lambda{j'} * {\rm wGEEHC}{j'}}, \end{aligned} \end{equation} where $\lambda{j'}$ is the $j'$th weight and $\sum {j'=1}^\lambda \lambda{j'} = 1$.

3.5. Microbiome higher criticism analysis for longitudinal data

The form of GEEMiHC is as follows: \begin{equation}\label{GEEMiHC} \begin{aligned}[b] T_{\rm GEEMiHC} = \min (\min {\lambda \in \Gamma} (P{{\rm uGEEHC}{(\lambda)}},P{{\rm wGEEHC}{(\lambda)}}), P{\rm Simes}), \quad \; \end{aligned} \end{equation} where $P_{{\rm uGEEHC}{(\lambda)}}$ and $P{{\rm wGEEHC}{(\lambda)}}$ are the $p$-values for the test statistics ${\rm uGEEHC}{(\lambda)}$ and ${\rm wGEEHC}{(\lambda)}$, respectively; and $P{\rm Simes}$ is the $p$-value for the Simes test \cite{Simes(1986)}, whose model is as follow: \begin{equation}\label{Simes} \begin{aligned}[b] P_{\rm Simes} = T_{\rm Simes} = \min {j \in {{1, 2, \ldots, m}}} {\frac{m p_j}{r_j}}. \end{aligned} \end{equation} To accommodate these uncertain correlation structures, it is valuable to analyze various related structures to obtain more accurate conclusions. The adaptive GEEMiHC (aGEEMiHC) integrates multiple GEEMiHCs with different structures, whose model is as follow: \begin{equation}\label{aGEEMiHC} \begin{aligned}[b] T{\rm aGEEMiHC} = \min (T^{\rm AR}{\rm GEEMiHC}, T^{\rm EX}{\rm GEEMiHC}, T^{\rm IN}{\rm GEEMiHC}), \end{aligned} \end{equation} where $T^{\rm AR}{\rm GEEMiHC}$, $T^{\rm EX}{\rm GEEMiHC}$ and $T^{\rm IN}{\rm GEEMiHC}$ represent test statistics for GEEMiHC(AR), GEEMiHC(EX) and GEEMiHC(IN), respectively (Eq. \ref{GEEMiHC}). For longitudinal data with an unknown underlying correlation within clusters, GEEMiHCs cannot always have robust stability, but aGEEMiHC can perform well.

4. Implementation

First of all, we import requisite R packages:

library(cluster)
library(permute)
library(phyloseq)
library(PGEE)  
library(GEEMiHC)

Then, we import example microbiome data:

data(CD_longitudinal)
CD_longitudinal

#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 372 taxa and 81 samples ]
#> sample_data() Sample Data:       [ 81 samples by 4 sample variables ]
#> phy_tree()    Phylogenetic Tree: [ 372 tips and 371 internal nodes ]

otu.tab <- CD_longitudinal@otu_table
tree <- CD_longitudinal@phy_tree
y <- sample_data(CD_longitudinal)$label
covs <- data.frame(matrix(NA, length(y), 2))
covs[,1] <- as.numeric(sample_data(CD_longitudinal)$age)
covs[,2] <- as.factor(sample_data(CD_longitudinal)$smoker)
id <- sample_data(CD_longitudinal)$id

The microbiome data we use includes OTU count table, phylogenetic tree and sample information, where sample information contains response Variable (e.g., disease status) and covariates (e.g., age, gender). Next, we test the association between CD status and microbial composition using aGEEMiHC:

set.seed(123)
out <- GEEMiHC(y, id, covs=covs, otu.tab=otu.tab, tree=tree, model="binomial", n.perm=5000)

We can find significant association between CD status and microbial composition based on aGEEMiHC (p-value $\approx$ 0.006 < 0.05), and rank order for significant factors (Lachnospiraceae, No. "3586" "7929" "9498" et al., and Ruminococcaceae, No. "1784" "4760" "594" et al.) associated with the host phenotype. Lachnospiraceae and Ruminococcaceae have ever reported differences in these bacteria between CD patients and healthy people.

out$ind.pvs.GEEMiHC.AR

#>   uGEEHC(h).AR wGEEHC(h).AR
#> 1   0.01980196   0.02520195
#> 3   0.02400195   0.02320195
#> 5   0.02280195   0.02140196
#> 7   0.02200196   0.02120196
#> 9   0.02140196   0.02060196


out$ind.pvs.GEEMiHC.EX

#>   uGEEHC(h).EX wGEEHC(h).EX
#> 1   0.02060196   0.02600195
#> 3   0.02480195   0.02400195
#> 5   0.02480195   0.02220196
#> 7   0.02260195   0.02260195
#> 9   0.02180196   0.02140196

out$ind.pvs.GEEMiHC.IN

#>   uGEEHC(h).IN wGEEHC(h).IN
#> 1   0.02520195   0.02700195
#> 3   0.02180196   0.01920196
#> 5   0.01940196   0.01660197
#> 7   0.01780196   0.01540197
#> 9   0.01700197   0.01480197

out$ada.pvs

#> GEEMiHC(AR) GEEMiHC(EX) GEEMiHC(IN)    aGEEMiHC 
#> 0.004801990 0.004201992 0.012601975 0.005601989

out$rank.order$uGEEHC.AR[1:50]

#>  [1] "3586" "8895" "2430" "2556" "7929" "5929" "7825" "1784" "8608" "288" 
#> [11] "4760" "9498" "6918" "594"  "4861" "8792" "587"  "6618" "9427" "4400"
#> [21] "4193" "817"  "1298" "6357" "6737" "9648" "1726" "9634" "1658" "3561"
#> [31] "4128" "4645" "5264" "518"  "7005" "5961" "4088" "9854" "8086" "1515"
#> [41] "8853" "3271" "3119" "9514" "5816" "5102" "2425" "9304" "1863" "9488"

5. Function SimulateOTU

5.1. Description

This function can be used for the OTUs count table simulated according to real data. The simulation process is implemented based on the Dirichlet-multinomial model. We first calculate the parameters of the real data, and then generate the corresponding cross-sectional simulated data according to the parameters.

5.2. Example

Parameter estimation:

library(dirmult) 
data("throat.otu.tab", package = "MiSPU")
nOTU = 100
otu_sum <- apply(throat.otu.tab, 2, sum)
throat.otu.tab.100 <- throat.otu.tab[, order(otu_sum, decreasing = T)[1:nOTU]]
parameters <- dirmult(throat.otu.tab.100)
parameters

Generation of microbiome data:

otu.tab <- SimulateOTU(throat.otu.tab.100, nSam = 50, parameters, mu = 1000, size = 25)

6. References

Sun H, et al. Detecting sparse microbial association signals adaptively from longitudinal microbiome data based on generalized estimating equations. (under review)
Liang K and Zeger S. Longitudinal data analysis using generalized linear models. Biometrika 1986;73:13–22.
Koh H and Zhao N. A powerful microbial group association test based on the higher criticism analysis for sparse microbial association signals. Microbiome 2020;8(1):63.
Paradis E, et al. APE: Analyses of phylogenetics and evolution in R language. Bioinformatics 2004;20(2):289-290.
Reynolds A, et al. Clustering rules: A comparison of partitioning and hierarchical clustering algorithms. J Math Model Algor 2006;5:475–504.
Schirmer M, et al. Microbial genes and pathways in inflammatory bowel disease. Nat Rev Microbiol 2019;17(8):497–511.
Simes RJ. An improved Bonferroni procedure for multiple tests of significance. Biometrika 1986;73(3):751-754.
Vázquez-Baeza Y., et al. Guiding longitudinal sampling in IBD cohorts. Gut 2018;67:1743-1745.
Wang L. GEE analysis of clustered binary data with diverging number of covariates. Ann. Statist. 2011;39:389–417.
Wang L, et al. Penalized generalized estimating equations for high-dimensional longitudinal data analysis. Biometrics 2012;68(2):353-360.
Wu C, et al. An adaptive association test for microbiome data. Genome Med 2016;8(1):56.

xpjiang-ccnu/GEEMiHC documentation built on May 15, 2023, 7:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

xpjiang-ccnu/GEEMiHC
Detecting sparse microbial association signals from longitudinal microbiome data based on generalized estimating equations

In xpjiang-ccnu/GEEMiHC: Detecting sparse microbial association signals from longitudinal microbiome data based on generalized estimating equations

1. Introduction

2. Installation

2.1. Requisite R packages

2.2. R package GEEMiHC

3. Model

3.1. Generalized estimating equations

3.2. Marginal score statistics

3.3. GEE-based higher criticism analyses

3.4. Adjustments for low sparsity levels

3.5. Microbiome higher criticism analysis for longitudinal data

4. Implementation

5. Function SimulateOTU

5.1. Description

5.2. Example

6. References

R Package Documentation

Browse R Packages

We want your feedback!

xpjiang-ccnu/GEEMiHC Detecting sparse microbial association signals from longitudinal microbiome data based on generalized estimating equations

In xpjiang-ccnu/GEEMiHC: Detecting sparse microbial association signals from longitudinal microbiome data based on generalized estimating equations

1. Introduction

2. Installation

2.1. Requisite R packages

2.2. R package GEEMiHC

3. Model

3.1. Generalized estimating equations

3.2. Marginal score statistics

3.3. GEE-based higher criticism analyses

3.4. Adjustments for low sparsity levels

3.5. Microbiome higher criticism analysis for longitudinal data

4. Implementation

5. Function SimulateOTU

5.1. Description

5.2. Example

6. References

R Package Documentation

Browse R Packages

We want your feedback!

xpjiang-ccnu/GEEMiHC
Detecting sparse microbial association signals from longitudinal microbiome data based on generalized estimating equations