R/wilma.R

Defines functions predict.wilma plot.wilma fitted.wilma .p.1gen p.1clust printClist summary.wilma print.wilma wilma

Documented in fitted.wilma p.1clust plot.wilma predict.wilma printClist print.wilma summary.wilma wilma

## Wilma - an algorithm for supervised grouping of predictor variables
wilma <- function(x, y, noc, genes = NULL, flip = TRUE,
                  once.per.clust = FALSE, trace = 0)
{
    ## Checking the input
    y <- as.integer(y)
    if(any(y < 0 | y > 1))
        stop("Labels y have to be 0 or 1!")
    n <- length(y)
    if(!is.matrix(x) || !is.numeric(x) || nrow(x) != n)
        stop("'x' must be a numeric matrix with 'n' (= length(y)) observations")
    if((noc <- as.integer(noc)) < 1)
        stop("'noc' must be a positive integer")
    if((trace <- as.integer(trace)) < 0)
        stop("'trace' must be integer >= 0 (or logical)")
    if(is.na(once.per.clust <- as.logical(once.per.clust)))
        stop("'once.per.clust' must be TRUE or FALSE")
    ## C output (Cverb > 0) only for trace >= 2 :
    Cverb <- if(trace) trace - 1L else 0L

    ## Customizing the problem and sign-flipping
    iy    <- sort.list(y)# i.e., first the 0's, then the 1's
    io    <- order(iy)
    y     <- y[iy]
    x     <- x[iy,]
    signs <- NULL
    if (flip)
      {
        res   <- sign.flip(x,y)
        x     <- res$flipped.matrix
        signs <- res$signs
      }

    ## Definitions
    n1 <- sum(y == 0)
    n2 <- n - n1 ## = sum(y == 1)
    p  <- ncol(x) ## = length(x)/n


    ## Looking for (optional) starting genes
    used <- rep(FALSE, p)
    if(!is.null(genes)) {
        if(!is.list(genes) || length(genes) != noc)
            stop("starting 'genes' must be a list of length 'noc' (=", noc,")")
        sgl <- unlist(genes)
        ##was sgl <- NULL ; for (i in 1:noc) sgl   <- c(sgl, genes[[i]])
        used[sgl] <- TRUE
    }

    mn.x <- gList <- vector("list", noc)
    steps <- integer(noc)

    ## The loop for supervised clustering
    for (i in 1:noc)
    {
        ## Initial value if (optional) starting clusters are provided
        size <- length(gic <- genes[[i]])
        clMean <- rep(0,n)

        if(trace) cat("\n", "\nCluster ", i, "\n----------\n", sep="")

        ## Forward search and cleaning stages
        nFwd <- 1
        repeat { ## Search forward and backward once :

            if (size > 0)
                clMean <- rowMeans(x[, gic, drop=FALSE])

            if(trace && nFwd > 1) {
                cat("used[]:", which(as.logical(used)),"\n")
                if(trace >= 2 && size > 0)
                    cat("entry gic[]:", gic, "\n")
            }

            res <- .C(R_multicluster,
                      as.double(x), as.integer(y),# 2
                      as.integer(n), as.integer(n1), as.integer(n2),# 5
                      as.integer(p),# 6
                      used = as.integer(used),# 7
                      as.double(clMean),
                      glsize = size,# 9
                      gic    = c(as.integer(gic), integer(p)),
                      scores = integer(size+p),
                      margins=  double(size+p),
                      as.integer(once.per.clust), # 13
                      Cverb)[
                      c("used", "glsize", "gic", "scores", "margins")]

            if(trace) {
                if(nFwd > 1 && trace >= 2 && size > 0)
                    cat("exit gic[1:res$glsize]:", res$gic[1:res$glsize], "\n")
                if(res$glsize > size) {
                    cat("\nAccepted",if(size > 0) "(additionally)","\n")
                    for(j in (size+1):res$glsize)
                        .p.1gen(j,  res$ gic [j],
                                s = res$ scores[j],
                                m = res$margins[j], digits = 3)
                    cat(" gic size changed from ", size," to ",res$glsize,"\n")
                }
                else ## res$glsize == size
                    cat(" gic[] *unchanged*\n")
            }
            gic.old <- gic
            size <- res$glsize
            gic  <- res$gic[1:size]
            used <- res$used

            if(trace) cat("\nEliminating ")
            gic.red  <- back.search(gic, x, y, verbose = trace > 0)
            if(length(gic.red) == size) {
                if(trace)
                    cat(" -- no reduction --> end{repeat} after ",
                        nFwd, if(nFwd == 1)"step" else "steps","\n")
                break
            }

            ## else : *have* reduced

            w <- gic[!(gic %in% gic.red)]
            if(trace) cat(" w= (", paste(w,collapse=','),")", sep="")
            used[w] <- FALSE
            gic <- gic.red
            size <- length(gic)

            if(trace)
                cat(" -- end {repeat} nr. ", nFwd,
                    ". gic enlarged and reduced from ", length(gic.old),
                    " to ", size, "\n", sep="")

            if (nFwd > 1 && length(gic.old) == size && all(gic.old == gic))
                break
            nFwd <- nFwd + 1

        }## end{repeat}

        gList[[i]] <- gic
        steps [i] <- nFwd
        mn.x [[i]] <-
            if(size) sapply(1:size,
                            function(j) rowMeans(x[, gic[1:j], drop=FALSE]))
            else integer(0)

	if(trace)
	    p.1clust(i, gic = gic, x.mean = mn.x[[i]], y = y, nFwd = nFwd)

    }## end for i = 1:noc

    ## Restoring the original order of output and response
    y <- y[io]
    for (i in 1:length(mn.x)) mn.x[[i]] <- mn.x[[i]][io,,drop=FALSE]

    ## Output
    r <- list(clist = gList,
              steps = steps, y = y, x.means = mn.x, noc = noc, signs=signs)
    class(r) <- "wilma" # cluster List
    r
}

## Short overview of what has been done by Wilma
print.wilma <- function(x, ...)
  {
    ## The number of clusters which were fitted
    noc <- x$noc

    ## Final score and margin values
    fvals <- fitted(x)
    s     <- apply(fvals, 2, score, x$y)
    m     <- apply(fvals, 2, margin, x$y)

    ## Formatting the numbers
    nGenes <- unlist(lapply(x$clist,length))
    cScore <- format(s)
    cMargi <- format(round(m,2))
    cG     <- format(nGenes)
    cI     <- format(1:noc)

    ## General overview
    cat("\nWilma called to fit ", x$noc, " cluster", if(noc>1) "s",
        "\n\n", sep="")

    ## Information for each of the predictors
    for (i in 1:noc)
      {
	gic <- x$clist[[i]]
        ng <- length(gic)
        cat("Cluster", cI[i], ": Contains ")
        cat(cG[i], " gene", if(ng != 1)"s", ", final score ", cScore[i],
            ", final margin ", cMargi[i], "\n", sep="")
      }
  }


## The next 3 functions provide a detailed overview about Wilma's clustering
summary.wilma <- function(object, ...)
  {
    cat("'Wilma' object: ")
    printClist(object, ...)
  }

## Auxiliary function for summary.wilma()
printClist <- function(x, ...)
  {
    ## Purpose: print (Method) "gene list" objects, called from print.multiclust
    ## ----------------------------------------------------------------------
    ## Author: Martin Maechler, Date: 21 Jun 2003, 21:38

    if(is.null(gic <- x$clist))
        stop("invalid 'x' argument")
    noc <- length(gic)
    cat("number of clusters 'noc' =", noc,"\n")
    for(i in 1:noc)
        p.1clust(i, gic = gic[[i]],
                 x.mean = x$x.means[[i]], y = x$y, nFwd = x$steps[i])
    invisible(x)
  }

## Another auxiliary function for summary.wilma()
p.1clust <- function(i, gic, x.mean, y, nFwd)
  {
    cat("\nFinal Cluster", i,
        "\n----------------\n\n")
    size <- length(gic)
    if(size >= 1)
        for (j in 1:size) {
            s <- score (x.mean[,j], y)
            m <- margin(x.mean[,j], y)
            .p.1gen(j=NULL, id = gic[j], s, m, digits = 3)
        }
    else cat(" __empty__ \n")
    invisible()
  }

.p.1gen <- function(j, id, s, m, digits = 3)
  {
    cat("Gen",
        if(is.null(j)) "" else paste0("[", j, "]",if(j <= 9)" "),
        ":", formatC(id, width=6),
        "  Score:", formatC(s, width=4),
        "  Margin:",
        formatC(format(c(round(m, digits=digits),0.735))[1], width=7, digits=3),
        "\n", sep="")
  }



## Returns the fitted values
fitted.wilma <- function(object, ...)
  {
    ## Fitted values
    out <- NULL
    for (i in 1:object$noc)
      {
        out <- cbind(out, (object$x.means[[i]])[,ncol(object$x.means[[i]])])
      }

    if (is.null(vNames <- colnames(out))) ## Naming the predictors
        vNames <- paste("Predictor", 1:object$noc)
    dimnames(out) <- list(1:length(object$y), vNames)
    out
}

## 2-dimensional projection of Wilma's output
plot.wilma <- function(x, xlab = NULL, ylab = NULL, col = seq(x$yvals),
                       main = "2-Dimensional Projection of Wilma's Output", ...)
  {
    ## Fitted values
    fvals <- fitted(x)

    ## If only 1 cluster was fitted
    if(x$noc <= 1)
      {
        if(is.null(xlab)) xlab <- "Mean Expression of Wilma's Cluster 1"
        if(is.null(ylab)) ylab <- "Class label"
        plot(fvals[,1], x$y, type = "n", xlab=xlab, ylab=ylab, main=main, ...)
        text(fvals[,1], x$y, x$y, col = col[1 + x$y])
        return()
      }

    ## For 2 clusters and more
    if(is.null(xlab)) xlab <- "Mean Expression of Wilma's Cluster 1"
    if(is.null(ylab)) ylab <- "Mean Expression of Wilma's Cluster 2"
    plot(fvals[,1], fvals[,2], type="n", xlab=xlab, ylab=ylab, main=main, ...)
    text(fvals[,1], fvals[,2], x$y, col=col[1 + x$y])
    invisible()
}


predict.wilma <- function(object, newdata = NULL, type = c("fitted", "class"),
                          classifier = c("nnr", "dlda", "logreg", "aggtrees"),
                          noc = object$noc, ...)
  {
    ## Checking the input
    type       <- match.arg(type)
    classifier <- match.arg(classifier)

    ## Return fitted values for the training data
    if (is.null(newdata))
      {
        if (type == "fitted") {
          if (length(noc)==1 && noc>object$noc)
          stop("You cannot predict with more predictors than you have fitted")

          if (length(noc)==1 && noc<=object$noc)
          return(fitted(object))

          if (length(noc)>1 & max(noc)>object$noc)
          stop("You cannot predict with more predictors than you have fitted")

          if (length(noc)>1 & max(noc)<=object$noc)
          return(fitted(object)[, noc, drop = FALSE])
        }

        if (type == "class") {
          if (length(noc)==1 && noc>object$noc)
          stop("You cannot predict with more predictors than you have fitted")

          if (length(noc)==1 && noc<=object$noc)
            {
              xlearn <- (fitted(object))[,1:noc, drop = FALSE]
              xtest  <- (fitted(object))[,1:noc, drop = FALSE]
              return(switch(classifier,
                            nnr      = nnr(xlearn, xtest, object$y),
                            dlda     = dlda(xlearn, xtest, object$y),
                            logreg   = logreg(xlearn, xtest, object$y),
                            aggtrees = aggtrees(xlearn, xtest, object$y)))
            }

          if (length(noc)>1 & max(noc)>object$noc)
          stop("You cannot predict with more predictors than you have fitted")

          if (length(noc)>1 && max(noc)<=object$noc)
            {
              clmt <- NULL
              for (i in 1:length(noc))
                {
                  xl <- (fitted(object))[, 1:(noc[i]), drop = FALSE]
                  xt <- (fitted(object))[, 1:(noc[i]), drop = FALSE]
                  cl <- switch(classifier,
                               nnr      = nnr(xl, xt, object$y),
                               dlda     = dlda(xl, xt, object$y),
                               logreg   = logreg(xl, xt, object$y),
                               aggtrees = aggtrees(xl, xt, object$y))
                  cl           <- matrix(cl, ncol=1)
                  dimnames(cl) <- list(1:nrow(xl), paste(noc[i],"Predictors"))
                  clmt         <- cbind(clmt, cl)
                }
              return(clmt)
            }
        }
      }

    ## Returning fitted values or class labels for test data
    else
      {
        ## Check the dimensions of the new data
        if (!is.null(object$signs) && ncol(newdata)!=length(object$signs))
          {
            stop(paste("The new data need to have the same number of",
                       "predictor variables as the training data"))
          }

        ## Flip the signs of the new data
        if (!is.null(object$signs))
          {
            newdata <- t(t(newdata)*object$signs)
          }

        ## Fitted values for the new data
        xtest <- NULL
        for (j in 1:object$noc)
          {
            xtest <- cbind(xtest, rowMeans(newdata[,object$clist[[j]],
                                                   drop = FALSE]))
          }

        ## Naming the predictors and samples
        sampnames <- 1:nrow(newdata)
        clusnames <- character(object$noc)
        for(i in 1:object$noc)      clusnames[i] <- paste("Predictor", i)
        dimnames(xtest) <- list(sampnames, clusnames)

        ## Return the fitted values for the new data if requested
        if (type == "fitted") {
          if (length(noc)==1 && noc>object$noc)
          stop("You cannot predict with more predictors than you have fitted")

          if (length(noc)==1 && noc<=object$noc)
          return(xtest[,1:noc, drop = FALSE])

          if (length(noc)>1 & max(noc)>object$noc)
          stop("You cannot predict with more predictors than you have fitted")

          if (length(noc)>1 & max(noc)<=object$noc)
          return(xtest[,noc, drop = FALSE])
        }

        ## Do 0/1-classification
        if (length(noc)==1 && noc>object$noc)
            stop("You cannot predict with more predictors than you have fitted")

        if (length(noc)==1 && noc==object$noc)
          {
            xlearn <- fitted(object)
            return(switch(classifier,
                          nnr      = nnr(xlearn, xtest, object$y),
                          dlda     = dlda(xlearn, xtest, object$y),
                          logreg   = logreg(xlearn, xtest, object$y),
                          aggtrees = aggtrees(xlearn, xtest, object$y)))
          }

        if (length(noc)==1 && noc<object$noc)
          {
            xlearn <- (fitted(object))[, 1:noc, drop = FALSE]
            xtest  <- xtest[, 1:noc, drop = FALSE]
            return(switch(classifier,
                          nnr      = nnr(xlearn, xtest, object$y),
                          dlda     = dlda(xlearn, xtest, object$y),
                          logreg   = logreg(xlearn, xtest, object$y),
                          aggtrees = aggtrees(xlearn, xtest, object$y)))
          }

        if (length(noc)>1 && max(noc)>object$noc)
            stop("You cannot predict with more predictors than you have fitted")

        if (length(noc)>1 && max(noc)<=object$noc)
          {
            clmt <- NULL
            for (i in 1:length(noc))
              {
                xl <- (fitted(object))[, 1:(noc[i]), drop = FALSE]
                xt <- xtest[, 1:(noc[i]), drop = FALSE]
                cl <- switch(classifier,
                             nnr      = nnr(xl, xt, object$y),
                             dlda     = dlda(xl, xt, object$y),
                             logreg   = logreg(xl, xt, object$y),
                             aggtrees = aggtrees(xl, xt, object$y))
                cl           <- matrix(cl, ncol=1)
                dimnames(cl) <- list(1:nrow(newdata),paste(noc[i],"Predictors"))
                clmt         <- cbind(clmt, cl)
              }
            return(clmt)
          }
      }
  }

Try the supclust package in your browser

Any scripts or data that you put into this service are public.

supclust documentation built on Sept. 27, 2021, 5:11 p.m.