#' Run the Generalized Alternating Maximization algorithm to fit the SEMMS model
#'
#' Coefficients of predictors in a GLM are modeled as a three-component normal
#' mixture, where the majority of predictors are assumed to belong to the null
#' component (i.e., they have no effect on the mean of the response) and the others can
#' have a positive or a negative effect. fitSEMMS runs a Generalized Alternating
#' Maximization algorithm to fit the model.
#' @param dat The dataset, as generated by readInputFile().
#' @param mincor The minimum correlation coefficient between pairs of putative variables, above which they are considered highly correlated. Default is 0.7.
#' @param nn The initial value for the number of non-null variables. Default is 5.
#' @param nnset Optional: instead of an initial number of candidates, specify the column numbers in the Z matrix to use in the first iteration. Default is NULL.
#' @param distribution The distribution of the response (N=gaussian, P=poisson, or B=binomial).
#' @param rnd Whether to run the randomized version of the algorithm (TRUE) or the greedy version (FALSE, the default).
#' @param BHthr The Benjamini-Hochberg threshold to be used when determining the initial set of non-null variables. Default=0.01.
#' @param initWithEdgeFinder Determines whether to use the edgefinder package to find highly correlated pairs of predictors (default=TRUE).
#' @param minchange The minimum change in the log-likelihood that is to be considered meaningful. Default=1.
#' @param maxst The maximum number of iterations of the algorithm. Default=20.
#' @param ptf Whether to print output at each iteration to a file called SEMMS.log. Default=FALSE.
#' @param verbose Whether to show progress messages to the user. Default=FALSE.
#' @return \item{inits}{The initial values obtained from initVals().}
#' \item{initNN}{The column numbers of the putative variables to be included in the model in the first iteration.}
#' \item{gam.out}{The output from the GAMupdate() function, which is a list containing
#' nn=the selected variables, mu,beta,s2r,s2e=the parameter estimates of the mixture model,
#' gam_nn=the sign (+/-1) of the effect of the non-null variables, pp=a table of posterior probabilities,
#' and lockedOut=any variables found to be highly correlated with selected variables.}
#' \item{distribution}{The distribution selected by the user.}
#' \item{mincor}{The user's input for mincor above. Ignored if initWithEdgeFinder is set to TRUE.}
#' @keywords SEMMS Generalized Alternating Maximization algorithm
#' @aliases SEMMS
#' @export
#' @examples
#' \dontrun{
#' fn <- system.file("extdata", "AR1SIM.RData", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=1, Zcols=2:100)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.8, nn=15, minchange= 1,
#' distribution="N",verbose=T,rnd=F)}
fitSEMMS <- function (dat, mincor = 0.7, nn = 5, nnset = NULL, distribution,
                      rnd = FALSE, BHthr = 0.01, initWithEdgeFinder = TRUE,
                      minchange = 1, maxst = 20, ptf = FALSE, verbose = FALSE) {
if (verbose) {
cat("Initializing...\n")
}
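# For initialization only, map binomial and Poisson responses to a roughly Gaussian
# working scale (an empirical logit / shifted log), then standardize.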
y0 <- dat$Y
if (distribution == "B") {
y0 <- log((dat$Y + 0.5)/(1.5 - dat$Y))
}
if (distribution == "P") {
y0 <- log(dat$Y + 0.1)
}
y0 <- scale(y0)
if(initWithEdgeFinder) {
M <- t(cbind(y0, dat$Z))
effit <- edgefinder(M, BHthr = max(1e-6, 1/choose(nrow(M),2)), LOvals = 100)
subgr <- graphComponents(effit$AdjMat[-1,-1], minCtr = 2)
Zcols <- sort(c(which(subgr$clustNo == 0), which(subgr$iscenter ==1)))
if (!is.null(nnset)) {
Zcols <- sort(union(nnset, Zcols))
nnset <- which(Zcols %in% nnset)
} else {
if (length(which(effit$AdjMat[1,-1] != 0)) > 0)
nnset <- which(effit$AdjMat[1,-1] != 0)
}
dat0 <- dat
dat$Z <- dat0$Z[,Zcols]
dat$K <- length(Zcols)
dat$originalZnames <- dat0$originalZnames[Zcols]
dat$colnamesZ <- dat0$colnamesZ[Zcols]
}
if (!is.null(nnset)) {
inits <- list(discard = rep(-1, dat$K), beta = rep(0, dat$K))
initsTmp <- initVals(as.matrix(dat$Z[, nnset], nrow = length(y0),
ncol = length(nnset)), y0, mincor = mincor)
inits$discard[nnset] <- initsTmp$discard
inits$beta[nnset] <- initsTmp$beta
initNN <- nnset
} else {
inits <- initVals(dat$Z, y0, mincor = mincor)
slctord <- setdiff(rev(order(abs(inits$beta))), which(inits$discard >= 0))
pvals <- dt(inits$beta * sqrt(dat$N - 2)/sqrt(1 - inits$beta^2),
dat$N - 2)
bhadj <- p.adjust(pvals, "BH")
bhslct <- setdiff(which(bhadj < BHthr), which(inits$discard >= 0))
if (length(bhslct) == 0) {
warning("\nNo variables were significant at the chosen FDR level when initializing SEMMS!\n")
}
N0 <- max(nn, length(bhslct))
if (rnd) {
initNN <- slctord[sample(1:N0, nn)]
}
else {
initNN <- slctord[1:nn]
}
}
initNN <- na.omit(initNN)
gam <- rep(0, dat$K)
gam[initNN] <- sign(inits$beta[initNN])
if (verbose) {
cat("Done!\nFitting...\n")
}
gam.out <- GAMupdate(initNN - 1, gam[initNN], as.matrix(dat$Y),
as.matrix(dat$X), dat$Z, distr = distribution,
randomize = rnd, minchange = minchange, mincor = mincor,
ptf = ptf, maxsteps = maxst)
if(initWithEdgeFinder) {
A <- effit$AdjMat
if (length(gam.out$nn) > 0){
gam.out$nn <- Zcols[gam.out$nn]
A[1, gam.out$nn+1] <- A[gam.out$nn+1, 1] <- 1
}
gam.out$lockedOut <- rep(0, dat0$K)
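# Lock out variables connected to Y directly or through one intermediate node
# (paths of length 1 or 2 from Y in the adjacency graph), excluding the selected set itself.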
lout <- setdiff(which((A + A%*%A)[1,] > 0), 1)
gam.out$lockedOut[setdiff(lout-1, gam.out$nn)] <- 1
inits$discard <- rep(-1, dat0$K)
gam.out$A <- effit$AdjMat
}
if (verbose) {
cat("Done!\n")
}
list(inits = inits, initNN = initNN, gam.out = gam.out,
distribution = distribution, mincor = mincor)
}
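# A minimal, standalone sketch (illustration only, not part of the package API) of the
# default initialization used in fitSEMMS() when nnset is not supplied: marginal
# coefficients (here approximated by correlations with the scaled response, standing in
# for the values returned by initVals()) are turned into t-type scores and screened with
# a Benjamini-Hochberg adjustment at BHthr, mirroring the code above. Simulated data only.
if (FALSE) {
  set.seed(1)
  N <- 100; K <- 50
  Z <- matrix(rnorm(N * K), N, K)
  y0 <- scale(Z[, 1] + 0.8 * Z[, 2] + rnorm(N))
  beta <- as.vector(cor(Z, y0))                    # marginal association of each column with y0
  pvals <- dt(beta * sqrt(N - 2) / sqrt(1 - beta^2), N - 2)
  bhslct <- which(p.adjust(pvals, "BH") < 0.01)    # initial set of non-null candidates
}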
#' Run the linear model using the selected variables
#'
#' Run glm with the selected variables (identified by nnt) as predictors.
#' @param dat The dataset, as generated by readInputFile().
#' @param nnt The column numbers in the Z matrix of the non-null variables.
#' @param distr The distribution to use when fitting the linear model (N=gaussian, P=poisson, or B=binomial).
#' @param func The function to use when fitting the model: glm or lm (default=glm).
#' @return \item{mod}{The object returned from glm or lm.}
#' \item{aic}{The AIC of the model.}
#' \item{vif}{The variance inflation factors (if more than one variable is used in the model).}
#' @keywords GLM
#' @export
#' @examples
#' \dontrun{
#' fn <- system.file("extdata", "AR1SIM.RData", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=1, Zcols=2:100)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.8, nn=15, minchange= 1,
#' distribution="N",verbose=T,rnd=F)
#' fittedGLM <- runLinearModel(dataYXZ,fittedSEMMS$gam.out$nn, "N")}
runLinearModel <- function(dat, nnt, distr, func="glm") {
distnames <- list(N="gaussian", P="poisson", B="binomial")
distr <- distnames[[distr]]
out <- vector("list", 3)
names(out) <- c("mod","aic","vif")
z <- as.matrix(dat$Z[,nnt])
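# Replace "*" in interaction column names with "_X_" so they are treated as plain
# variable names (not interaction operators) in the model formula below.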
colnamesZ <- gsub("\\*","_X_",dat$colnamesZ,perl=TRUE)
colnames(z) <- colnamesZ[nnt]
Xset <- intersect(1:dat$P,which(colnames(dat$X) != "intercept"))
if (length(Xset) > 0) {
x <- as.matrix(dat$X[,Xset])
colnames(x) <- colnames(dat$X)[Xset]
tt.xvars <- colnames(x)
} else {
x <- c()
tt.xvars <- c()
}
if(length(colnames(z)) > 0){
tt.vars <- c(tt.xvars, colnames(z))
} else {
tt.vars <- tt.xvars
}
Yw <- dat$Y
if ((distr == "poisson") && (func == "lm")) {
Yw <- log(0.1+dat$Y)
}
if ((distr == "binomial") && (func == "lm")) {
Yw <- log((dat$Y+0.5)/(1.5-dat$Y))
}
tt.form <- formula(paste("Yw ~ ", paste(tt.vars, collapse = " + ")))
datf <- data.frame(cbind(Yw, x, z))
colnames(datf)[1] <- "Yw"
if (func == "lm") {
out$mod <- lm(tt.form, data=datf)
} else {
out$mod <- glm(tt.form, data=datf, family = distr)
}
out$aic <- AIC(out$mod)
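# vif() computes variance inflation factors; it is typically provided by the car package.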
if (length(colnames(z)) > 1) {
out$vif <- vif(out$mod)
}
return(out)
}
#' Print the SEMMS model parameter estimates and non-null variables
#'
#' @param gam.out the element called gam.out in the list returned from fitSEMMS()
#' @keywords parameter estimates
#' @export
#' @examples \dontrun{
#' fn <- system.file("extdata", "AR1SIM.RData", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=1, Zcols=2:100)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.8, nn=15, minchange= 1,
#' distribution="N",verbose=T,rnd=F)
#' printModelParams(fittedSEMMS$gam.out) }
printModelParams <- function(gam.out) {
cat("SEMMS model - parameter estimates:\nbeta=",gam.out$beta,
"\ns2e= ",gam.out$s2e, "\nmu= ",gam.out$mu,"\ns2r= ", gam.out$s2r,
"\nSelected columns (non-null)=", gam.out$nn,"\n\n")
}
#' Perform 2D multidimensional scaling
#'
#' As in Diaconis et al. (2008). Returns a list with a single element, conf: a matrix
#' with two columns (the x and y coordinates).
#'
#' @param dist1 a distance matrix
#' @keywords multidimensional scaling
#' @export
#' @examples
#' \dontrun{plot(mds2D(as.matrix(dist(iris[,1:4])))$conf, pch=19, cex=0.7, col=4, axes=FALSE)}
#' @references Diaconis, Persi; Goel, Sharad; Holmes, Susan. Horseshoes in multidimensional scaling and local kernel methods. Ann. Appl. Stat. 2 (2008), no. 3, 777--807.
mds2D <- function(dist1) {
# the following 5 lines follow Diaconis, Goel and Holmes (2008, AoAS)
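# Double centering, as in classical multidimensional scaling: H is the n x n centering
# matrix, K2 is the doubly centered distance matrix (rescaled by -1/n), and the two
# leading eigenvectors, each scaled by the square root of its eigenvalue, give the 2D
# configuration.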
n <- ncol(dist1)
H <- diag(rep(1,n))-matrix(1/n,n,n)
K2 <- ((-0.5)*H%*%as.matrix(dist1)%*%H) / (n/2)
K2.eigs <- eigen(K2)
coord2D <- cbind(sqrt(K2.eigs$values[1])*K2.eigs$vectors[,1],
-sqrt(K2.eigs$values[2])*K2.eigs$vectors[,2])
# first, move and rotate so that Y is at the origin, and the angles
# are centered around the x axis
ctmp <- coord2D/max(sqrt(rowSums(coord2D^2)))
ctmp <- ctmp - cbind(rep(ctmp[1,1],nrow(dist1)),rep(ctmp[1,2],nrow(dist1)))
thetam <- median(atan2(ctmp[,2],ctmp[,1])[-1])
ctmp <- ctmp%*%matrix(c(cos(-thetam),-sin(-thetam),sin(-thetam),cos(-thetam)), ncol=2)
# Next, try to space out close points a bit if needed
minth <- pi/n
thetas <- atan2(ctmp[,2],ctmp[,1])[-1]
if (min(thetas)*max(thetas) < 0) {
posth <- which(thetas > 0)
negth <- which(thetas <= 0)
p1 <- which.min(thetas[posth])
n1 <- which.max(thetas[negth])
if (thetas[p1]-thetas[n1] < minth) {
Rt <- matrix(c(cos(minth/2),-sin(minth/2),sin(minth/2),cos(minth/2)), ncol=2)
ctmp[posth+1,] <- ctmp[posth+1,]%*%Rt
Rt <- matrix(c(cos(-minth/2),-sin(-minth/2),sin(-minth/2),cos(-minth/2)), ncol=2)
ctmp[negth+1,] <- ctmp[negth+1,]%*%Rt
}
for (i in 1:length(posth)) {
Rt <- matrix(c(cos(minth/2),-sin(minth/2),sin(minth/2),cos(minth/2)), ncol=2)
sset <- which(thetas[posth] > thetas[posth[i]])
if (length(sset) == 0) { next }
p1 <- which.min(thetas[posth[sset]])
if (thetas[sset[p1]]-thetas[posth[i]] < minth) {
ctmp[posth[sset]+1,] <- ctmp[posth[sset]+1,]%*%Rt
}
}
for (i in 1:length(negth)) {
Rt <- matrix(c(cos(-minth/2),-sin(-minth/2),sin(-minth/2),cos(-minth/2)), ncol=2)
sset <- which(thetas[negth] < thetas[negth[i]])
if (length(sset) == 0) { next }
n1 <- which.max(thetas[negth[sset]])
if (thetas[negth[i]] - thetas[sset[n1]] < minth) {
ctmp[negth[sset]+1,] <- ctmp[negth[sset]+1,]%*%Rt
}
}
}
rownames(ctmp) <- colnames(dist1)
list(conf=ctmp)
}
#' Plot the fitted model found by SEMMS as a network.
#'
#' Plot a network of the response, Y, and all the variables that were found by SEMMS to be
#' associated with Y. If additional variables are highly correlated with ones found by
#' SEMMS, they will also be plotted, with edges between pairs of highly correlated
#' variables.
#' @param dataYXZ the SEMMS dataset, created by readInputFile()
#' @param fittedSEMMS the fitted model, returned by fitSEMMS()
#' @param fittedGLM a list with 3 elements: mod = the glm or lm object, aic, and vif,
#' as generated by runLinearModel()
#' @param ttl An optional title for the plot (default is blank)
#' @keywords Network diagram
#' @export
#' @examples
#' \dontrun{
#' fn <- system.file("extdata", "AR1SIM.RData", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=1, Zcols=2:100)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.8, nn=15, minchange= 1,
#' distribution="N",verbose=T,rnd=F)
#' fittedGLM <- runLinearModel(dataYXZ,fittedSEMMS$gam.out$nn, "N")
#' plotMDS(dataYXZ, fittedSEMMS, fittedGLM, ttl="AR1 simulation")}
plotMDS <- function (dataYXZ, fittedSEMMS, fittedGLM, ttl = "") {
nns <- fittedSEMMS$gam.out$nn
nnl <- which(fittedSEMMS$gam.out$lockedOut > 0)
sset <- unique(sort(c(nns, nnl, fittedSEMMS$inits$discard[nnl] + 1,
fittedSEMMS$inits$discard[nns] + 1)))
if (!is.null(fittedSEMMS$gam.out$A)) {
sset <- sort(union(fittedSEMMS$gam.out$nn,
which(fittedSEMMS$gam.out$lockedOut > 0)))
}
sset <- setdiff(sset, 0)
if (length(sset) <= 1)
return(NULL)
Zs <- cbind(scale(dataYXZ$Y), dataYXZ$Z[, sort(sset)])
colnames(Zs) <- c("Y", dataYXZ$colnamesZ[sort(sset)])
crdist <- 1 - abs(cor(Zs))
mdsdat <- mds2D(crdist)
D <- max(mdsdat$conf[, 1]) - min(mdsdat$conf[, 1])
plot(mdsdat$conf, axes = F, pch = 19, col = 0, xlab = "",
ylab = "", cex = 0.6, xlim = c(min(mdsdat$conf[, 1]) - D/5,
max(mdsdat$conf[, 1])), main=ttl)
nnspts <- which(rownames(mdsdat$conf) %in% dataYXZ$colnamesZ[nns])
mod <- which(colnames(Zs) %in% dataYXZ$colnamesZ[nns])
if (!is.null(fittedSEMMS$gam.out$A)) {
A <- fittedSEMMS$gam.out$A[c(1,1+sset),c(1,1+sset)]
} else {
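# Without an edgefinder adjacency matrix, build one by thresholding:
# an edge is drawn between pairs whose absolute correlation exceeds mincor^2.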
A <- ceiling(pmax(1 - crdist - fittedSEMMS$mincor^2, 0))
diag(A) <- 0
A[1, mod] <- A[mod, 1] <- 1
}
for (i in 1:(nrow(A) - 1)) {
for (j in (i + 1):nrow(A)) {
if (A[i, j] == 1) {
col <- "navyblue"
wid <- 2
if ((i != 1) | !(j %in% mod)) {
col <- "gray"
wid <- 1
}
lines(c(mdsdat$conf[i, 1], mdsdat$conf[j, 1]),
c(mdsdat$conf[i, 2], mdsdat$conf[j, 2]), col = col, lwd = wid)
}
}
}
for (j in 1:length(nnspts)) lines(c(mdsdat$conf[1, 1], mdsdat$conf[nnspts[j], 1]),
c(mdsdat$conf[1, 2], mdsdat$conf[nnspts[j], 2]),
col = "navyblue", lwd = 2)
points(mdsdat$conf[nnspts, 1], mdsdat$conf[nnspts, 2], col = "red",
pch = 18, cex = 1.5)
points(mdsdat$conf[-c(1, nnspts), 1], mdsdat$conf[-c(1, nnspts), 2],
col = "orange", pch = 19, cex = 0.7)
points(mdsdat$conf[nnspts, 1], mdsdat$conf[nnspts, 2], col = "red",
pch = 18, cex = 1.5)
text(mdsdat$conf[1, 1], mdsdat$conf[1, 2], "Y", col = "red",
cex = 1.5)
text(mdsdat$conf[nnspts, 1], mdsdat$conf[nnspts, 2], dataYXZ$originalZnames[nns],
pos = 2, col = 2, cex = 0.7)
text((mdsdat$conf[nnspts, 1] + mdsdat$conf[1, 1])/2,
(mdsdat$conf[nnspts, 2] + mdsdat$conf[1, 2])/2,
format(fittedGLM$mod$coefficients[-1], digits = 2), col = 2, cex = 0.7)
}
#' Diagnostic plots - fitted vs. observed, and fitted vs. residuals
#'
#' Create two diagnostic plots for the fitted model.
#' @param fittedGLM a list with 3 elements: mod = the glm or lm object, aic, and vif,
#' as generated by runLinearModel()
#' @param ttl An optional title for the plots (default is blank)
#' @keywords Diagnostics plots
#' @export
#' @examples
#' \dontrun{
#' fn <- system.file("extdata", "AR1SIM.RData", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=1, Zcols=2:100)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.8, nn=15, minchange= 1,
#' distribution="N",verbose=T,rnd=F)
#' fittedGLM <- runLinearModel(dataYXZ,fittedSEMMS$gam.out$nn, "N")
#' plotFit(fittedGLM)}
plotFit <- function(fittedGLM, ttl="") {
with(fittedGLM, {
par(mfrow=c(1,2))
m <- min(mod$fitted.values, mod$y)
m <- sign(m)*1.05*abs(m)
M <- max(mod$fitted.values, mod$y)
M <- sign(M)*1.05*abs(M)
plot(mod$y, mod$fitted.values, xlab="y",ylab="Fitted value",pch=19,col="grey33",axes=F,cex=0.7)
axis(1); axis(2)
abline(0,1,col=2)
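# studres() (studentized residuals) is provided by the MASS package.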
sres <- studres(mod)
plot(mod$fitted.values, sres,
ylim=c(min(sres, -2.1),max(sres, 2.1)),
xlab="Fitted values",ylab="Studentized residuals",pch=19,col="grey33",axes=F,cex=0.7)
abline(h=0,col="red",lwd=2)
abline(h=c(-2,2),col="blue",lty=2)
axis(1); axis(2)
par(mfrow=c(1,1))
})
}
#' Read the input data and create the data structures
#'
#' Read a text file containing the response variable, any fixed effects (possibly none),
#' and the putative variables. An intercept does not have to be in the data file - it
#' can be added by this function. The function can also add all two-way interactions.
#' The data may be loaded from an RData file, if it has been stored in the format that this
#' function generates.
#' The interaction terms are transformed using the scale function.
#' @param file The input file name.
#' @param skip How many lines to skip in the input file (default=0).
#' @param header Does the first line being read contain column names? (default=TRUE).
#' @param sep The delimiter (default is \\t).
#' @param ycol The number of the column in the input file which should be used as the response.
#' @param Zcols The columns in the input file which contain the putative variables.
#' @param Xcols The columns in the input file which contain the fixed effects in the model (default is none, c()).
#' @param addIntercept Whether to add an intercept column to the fixed effects (default=TRUE).
#' @param logTransform The column numbers which have to be log-transformed (default is none, c()).
#' @param twoWay Whether to add all two-way interactions to the putative variables (default=FALSE).
#' @return \item{Z}{A numeric matrix of putative variables (the columns of Z)}
#' \item{Y}{The response vector}
#' \item{X}{Fixed effects matrix (could be null)}
#' \item{originalZnames}{The original names of the columns of the matrix Z}
#' \item{colnamesZ}{The generated column names, Z0001, Z0002, etc.}
#' \item{N}{The number of samples, nrow(Z)}
#' \item{K}{The number of putative variables, ncol(Z)}
#' \item{P}{The number of fixed effects, ncol(X)}
#' @keywords input data format
#' @export
#' @examples
#' \dontrun{
#' fn <- system.file("extdata", "ozone.txt", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=2, skip=19, Zcols=3:11,
#' addIntercept = TRUE, logTransform = 2, twoWay = TRUE)}
readInputFile <- function(file, skip=0, header=TRUE, sep="\t", ycol, Zcols,
Xcols=c(), addIntercept = TRUE, logTransform = c(),
twoWay = FALSE){
if (length(grep("RData",file)) > 0) {
data <- get(load(file))
} else {
if ((length(grep(".txt",file)) > 0) || (length(grep(".csv",file)) > 0)
|| (length(grep(".dat",file)) > 0))
data <- read.table(file, skip=skip, header=header, sep=sep)
}
if (length(Xcols) == 0) { addIntercept = TRUE }
if(length(logTransform) > 0) {
data[,logTransform] <- log(data[,logTransform])
}
Y <- data[,ycol]
N <- length(Y)
if (addIntercept) {
X <- cbind(matrix(1,ncol=1,nrow=N), data[,Xcols])
P <- length(Xcols)+1
colnames(X) = c("intercept",colnames(data)[Xcols])
} else {
# if addIntercept == FALSE, it is assumed the X already
# includes the intercept column
X <- as.matrix(data[,Xcols])
P <- length(Xcols)
}
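# Expand any non-intercept fixed effects via model.matrix (so factors become dummy
# columns) and standardize them, keeping the unscaled intercept as the first column.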
if (P > 1) {
Xtmp <- data.frame(X)
tmp.form <- formula(paste("~ ", setdiff(colnames(Xtmp),"intercept"), collapse="+"))
modmat <- model.matrix(tmp.form, Xtmp)
X <- cbind(modmat[,1],apply(as.matrix(modmat[,-1]),2,scale))
colnames(X) <- c("intercept",colnames(modmat)[-1])
}
Z <- as.matrix(data[,Zcols])
Z <- apply(Z,2,scale)
K <- length(Zcols)
if(twoWay) {
Z <- addInteractions(Z)
Z <- apply(Z,2,scale)
K <- ncol(Z)
}
originalZnames <- colnames(Z)
colnames(Z) <- sprintf("Z%04d",1:K)
colnamesZ <- colnames(Z)
list(Z=Z, Y=Y, X=X, originalZnames=originalZnames, colnamesZ=colnamesZ,
N=N, K=K, P=P)
}
#' Add interaction terms for each pair of putative variables
#'
#' Create all the second-order polynomials Z_i*Z_j where Z_i are columns of the matrix
#' Z (including Z_i*Z_i), where Z is an n (samples) by p (covariates) matrix.
#' @param Z a numeric matrix of putative variables, n (samples) by p (covariates).
#' @return a matrix which includes the input matrix, Z, and all the two-way interactions.
#' @keywords interactions
#' @export
#' @examples
#' Z <- addInteractions(matrix(c(1,2,3, 11,12,13), nrow = 2,
#' ncol = 3, byrow = TRUE,dimnames = list(c("row1", "row2"),
#' c("C.1", "C.2", "C.3"))))
addInteractions <- function(Z) {
K <- ncol(Z)
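# The output has the K original columns plus K squared terms and choose(K, 2)
# cross-products, i.e., 2*K + choose(K, 2) columns in total.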
Zret <- matrix(0, nrow=nrow(Z), ncol=2*K+choose(K, 2))
colnames(Zret) <- rep("", ncol(Zret))
Zret[,1:K] <- Z
colnames(Zret)[1:K] <- colnames(Z)
k <- K+1
for (i in 1:K) {
for (j in i:K) {
Zret[,k] = Z[,i]*Z[,j]
colnames(Zret)[k] <- sprintf("%s*%s",colnames(Zret)[i], colnames(Zret)[j])
k = k+1
}
}
return(Zret)
}
#' Ozone data
#'
#' The air-pollution data set analyzed in this case study was first introduced in (Breiman and Friedman 1985). It consists of daily measurements of ozone concentration levels in the Los Angeles basin, collected over 330 days in 1976. There are eight meteorological explanatory variables, plus the day of the year. Note that the response (column 2) has to be log transformed. The predictors appear in columns 3-11. We add an intercept, and construct the two-way interaction terms in the example below.
#'
#' @docType data
#' @keywords datasets
#' @name ozone
#' @examples \dontrun{
#' fn <- system.file("extdata", "ozone.txt", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=2, skip=19, Zcols=3:11,
#' addIntercept = TRUE, logTransform = 2, twoWay = TRUE)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.8, nn=15, minchange= 1,
#' distribution="N",verbose=T,rnd=F)
#' fittedGLM <- runLinearModel(dataYXZ,fittedSEMMS$gam.out$nn, "N")
#' plotMDS(dataYXZ, fittedSEMMS, fittedGLM, ttl="Ozone")}
#' @format A text file with the ozone data. The data starts on line 20.
#' @references Breiman, L., and J. H. Friedman. 1985.
#' "Estimating Optimal Transformations for Multiple Regression and Correlation."
#' Journal of the American Statistical Association 80 (391): 580-598.
NULL
#' Simulated gene expression data
#'
#' AR1SIM is a simulated dataset with a hub structure, consisting of 1000 predictors arranged in 50 hubs. The response variable was constructed as the sum of the variables in the first hub, plus some Gaussian noise.
#'
#' @docType data
#' @keywords datasets
#' @name AR1SIM
#' @examples \dontrun{
#' fn <- system.file("extdata", "AR1SIM.RData", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=1, Zcols=2:100)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.8, nn=15, minchange= 1,
#' distribution="N",verbose=T,rnd=F)
#' fittedGLM <- runLinearModel(dataYXZ,fittedSEMMS$gam.out$nn, "N")
#' plotMDS(dataYXZ, fittedSEMMS, fittedGLM, ttl="AR1 simulation")}
#' @format A 1001 by 100 matrix.
NULL
#' NKI70 data
#'
#' The NKI70 data is available from the “penalized” package in R (Goeman, 2010)
#'
#' @docType data
#' @keywords datasets
#' @name NKI70
#' @examples \dontrun{
#' fn <- system.file("extdata", "NKI70_t1.RData", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=1, Zcols=2:73,addIntercept = TRUE)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.8, nn=6, minchange= 1,
#' distribution="P", verbose=T, rnd=F)
#' fittedGLM <- runLinearModel(dataYXZ,fittedSEMMS$gam.out$nn, "P")
#' plotMDS(dataYXZ, fittedSEMMS, fittedGLM, ttl="NKI70")}
#' @format The NKI70 data has 144 observations and 72 predictors.
#' @references Goeman, J. J. 2010. "L1 Penalized Estimation in the Cox Proportional Hazards Model."
#' Biometrical Journal 52 (1): 70-84.
NULL
#' Simulated gene expression data with a binary response
#'
#' SimBin is a simulated dataset with an autoregressive structure. The true predictors
#' are Z1-Z5 and Z101-Z105. Z1-Z5 have an AR(1) structure with rho=0.95, and so do Z101-Z105.
#'
#' @docType data
#' @keywords datasets
#' @name SimBin
#' @examples \dontrun{
#' fn <- system.file("extdata", "SimBin.RData", package = "SEMMS", mustWork = TRUE)
#' dataYXZ <- readInputFile(fn, ycol=1, Zcols=2:1001)
#' fittedSEMMS <- fitSEMMS(dataYXZ, mincor=0.7, nn=5, minchange = 1,
#' distribution="B", verbose=T, rnd=F)
#' fittedGLM <- runLinearModel(dataYXZ,fittedSEMMS$gam.out$nn, "B")
#' plotMDS(dataYXZ, fittedSEMMS, fittedGLM, ttl="Simulated Binomial (AR1)")}
#' @format A 1001 by 120 matrix.
NULL