netresponse: Functional Network Analysis

#' @title ICMg.combined.sampler
#' 
#' @description Main function of the ICMg algorithm.
#' ICMg.combined.sampler computes samples
#' from the posterior of the assignments of datapoints (interactions and
#' expression profiles) to latent components. From these we can then obtain
#' component membership distributions and clusterings for genes.
#'
#' @details One run consists of two parts, during burnin the sampler is
#' expected to mix,
#' after which the samples are taken. Information about convergence (convN and
#' convL are estimates of convergence for link and node sampling, respectively)
#' and component sizes are printed after each burnin/sample round. For example:
#' B.num=8, B.size=100, S.num=20, S.size=10, runs 800 burnin iterations in 8
#' rounds and then takes 20 samples with an interval of 10 iterations.
#' 
#' @param L N x 2 matrix of link endpoints (N = number of links).
#' @param X M x D matrix of gene expression profiles (M = number of nodes, D =
#' number of observations).
#' @param C Number of components.
#' @param alpha Hyperparameter describing the global distribution over
#' components, larger alpha gives a more uniform distribution.
#' @param beta Hyperparameter describing the component-wise distributions over
#' nodes, larger beta gives a more uniform distribution.
#' @param pm0 Hyperparameter describing the prior mean of the expression
#' profiles, should be zero.
#' @param V0 Hyperparameter describing the variation of the component-wise
#' expression profiles means around pm0.
#' @param V Hyperparameter describing the variation of gene-specific expression
#' profiles around the component-wise means.
#' @param B.num Number of burnin rounds.*
#' @param B.size Size of one burnin round.*
#' @param S.num Number of sample rounds.*
#' @param S.size Size of one sample round.*
#' @param C.boost Set to 1 to use faster iteration with C, set to 0 to use
#' slower R functions.
#' @return Returns samples as a list: \item{z}{ S.num x N matrix of samples of
#' component assignments for links. } \item{w}{ S.num x M matrix of samples of
#' component assignments for gene expression profiles. } \item{convl}{ Vector
#' of length (B.num + S.num) with convergence estimator values for link
#' sampling. } \item{convn}{ Vector of length (B.num + S.num) with convergence
#' estimator values for node sampling. } \item{countsl}{ (B.num + S.num) x C
#' matrix of link component sizes. } \item{countsn}{ (B.num + S.num) x C matrix
#' of node component sizes. } additionally all parameters of the run are
#' included in the list.
#' @author Juuso Parkkinen
#' @seealso \code{\link{ICMg.links.sampler}}
#' @references Parkkinen, J. and Kaski, S. Searching for functional gene
#' modules with interaction component models. BMC Systems Biology 4 (2010), 4.
#' @keywords methods
#' @export
#' @examples
#'   data(osmo) # Load data set
#'   res <- ICMg.combined.sampler(osmo$ppi, osmo$exp, C=10) 
#' 
ICMg.combined.sampler <- function(L, X, C, alpha=10, beta=0.01, pm0=0, V0=1, V=0.1, B.num=8, B.size=100, S.num=20, S.size=10, C.boost=1) {
# Sample posterior assignments of data points to the latent components, based on both network and functional data

  message("Sampling ICMg2...\n")
  ## L is Nx2 matrix of link endpoints
  ## X is MXD matrix of node data (real value vectors)
  ## C is the number of latent components
  ## alpha and beta are the hyperparameters
  ## pm0, V0 and V are the node data parameters

  ## B.size is the size of one burnin round,
  ## after which convergence and component counts are reported
  ## B.num is the number of these burnin rounds
  ## S size is likewise the size of one sample round, after which samples are taken
  ## S.num is the number of these sample rounds
  ## C.boost = 1 uses faster C functions for sampling

  N <- dim(L)[1]                # The number of links
  M <- dim(X)[1]                # The number of nodes
  D <- dim(X)[2]                # The number of observations in the expression data

  message("nodes:",N,"links:",M,"observations:",D,"components:",C,"alpha:",alpha,"beta:",beta,"\n")
  
  out <- list()
  out$z <- vector("integer", N)    # link l is generated by component z[l]
  out$w <- vector("integer", M)    # node k is generated by component w[k]
  out$n <- vector("integer", C)    # n[z] links generated by component z
  out$m <- vector("integer", C)    # m[w] nodes generated by component w
  out$q <- matrix(0, C, M)       # q[z,i] links generated by z end at i, {{and + 1, if node k generated by z}}
  out$E <- matrix(0, C, D) # \sum_k^{m_w} x_k Sum of datas generated by component w

  ## Initialize vectors randomly
  out$z <- sample(1:C, N, replace=TRUE)
  out$w <- sample(1:C, M, replace=TRUE)
  out$n <- as.vector(table(factor(out$z, levels=1:C)))
  out$m <- as.vector(table(factor(out$w, levels=1:C)))

  ## Initialize k and E according to z and w
  for (l in 1:N) {
    out$q[out$z[l], L[l,1]] <- out$q[out$z[l], L[l,1]]+1
    out$q[out$z[l], L[l,2]] <- out$q[out$z[l], L[l,2]]+1
  }
  for (k in 1:M) {
    out$q[out$w[k],k] <- out$q[out$w[k],k] +1
    out$E[out$w[k],] <- out$E[out$w[k], ] + X[k, ]
  }

  ## Initialize C random number generator
  ICMg.randominit()

  ## Initialize some necessary data structures
  samples <- list(z=matrix(0, S.num, N), w=matrix(0, S.num, M),
                  convl=vector("numeric", B.num+S.num),
                  convn=vector("numeric", B.num+S.num),
                  countsl=matrix(0,B.num+S.num,C),
                  countsn=matrix(0,B.num+S.num,C))
  convl <- vector("numeric", N)
  convn <- vector("numeric", M)
  
  message("Sampling", B.num*B.size + S.num*S.size,"iterationcs\n")
  
  ## Burnin
  message("Burnin iterations:",B.num*B.size,"\n")
  message("I: 0\n")
  message("n(z):",out$n,"\n")
  message("m(z):",out$m,"\n")

  for (b in 1:B.num) {

    ## Randomize links and nodes
    Lindices <- sample(N)
    Nindices <- sample(M)   
    ## Sample components for links and nodes
    out <- ICMg.combined.wrapper(L, X, B.size, N, M, Lindices, Nindices,
                         D, C, out$z, out$w, out$n, out$m, out$q, out$E,
                         alpha, beta, pm0, V0, V, convl, convn, C.boost)
    ## Save and print convergence monitoring and component counts
    samples$convl[b] <- mean(log(out$convl))
    samples$countsl[b,] <- out$n
    samples$convn[b] <- mean(log(out$convn))
    samples$countsn[b,] <- out$m
    message("I:",b*B.size,"\n")
    message("convL:",samples$convl[b],"n(z):",samples$countsl[b,],"\n")
    message("convN:",samples$convn[b],"m(z):",samples$countsn[b,],"\n")
  }

  ## Samples
  message("Sample iterations:",S.num*S.size,"\n")
  for (s in 1:S.num) {

    ## Randomize links and nodes
    Lindices <- sample(N)
    Nindices <- sample(M) 
    ## Sample components for links and nodes
    out <- ICMg.combined.wrapper(L, X, B.size, N, M, Lindices, Nindices,
                         D, C, out$z, out$w, out$n, out$m, out$q, out$E,
                         alpha, beta, pm0, V0, V, convl, convn, C.boost)
    ## Save and print convergence monitoring and component counts
    samples$convl[B.num+s] <- mean(log(out$convl))
    samples$countsl[B.num+s,] <- out$n
    samples$convn[B.num+s] <- mean(log(out$convn))
    samples$countsn[B.num+s,] <- out$m
    message("I:",B.num*B.size+s*S.size,"\n")
    message("convL:",samples$convl[B.num+s],"n(z):",samples$countsl[B.num+s,],"\n")
    message("convN:",samples$convn[B.num+s],"m(z):",samples$countsn[B.num+s,],"\n")
    ## Take samples
    samples$z[s,] <- out$z
    samples$w[s,] <- out$w
  }

  ## Save parameters
  samples$B.num <- B.num
  samples$B.size <- B.size
  samples$S.num <- S.num
  samples$S.size <- S.size
  samples$alpha <- alpha
  samples$beta <- beta
  samples$pm0 <- pm0
  samples$V0 <- V0
  samples$V <- V

  message("DONE\n")
  return(samples)
}



#' @title ICMg.links.sampler
#' 
#' @description ICMg.links.sampler computes samples from the posterior
#' of the assignments of
#' datapoints (interactions) to latent components. From these we can then
#' obtain component membership distributions and clusterings for genes.
#' 
#' @details One run consists of two parts, during burnin the sampler
#' is expected to mix,
#' after which the samples are taken. Information about convergence (convN and
#' convL are estimates of convergence for link and node sampling, respectively)
#' and component sizes are printed after each burnin/sample round. For example:
#' B.num=8, B.size=100, S.num=20, S.size=10, runs 800 burnin iterations in 8
#' rounds and then takes 20 samples with an interval of 10 iterations.
#' 
#' @usage ICMg.links.sampler(L, C, alpha=10, beta=0.01, B.num=8, B.size=100,
#' S.num=20, S.size=10, C.boost=1)
#' @param L N x 2 matrix of link endpoints (N = number of links).
#' @param C Number of components.
#' @param alpha Hyperparameter describing the global distribution over
#' components, larger alpha gives a more uniform distribution.
#' @param beta Hyperparameter describing the component-wise distributions over
#' nodes, larger beta gives a more uniform distribution.
#' @param B.num Number of burnin rounds.*
#' @param B.size Size of one burnin round.*
#' @param S.num Number of sample rounds.*
#' @param S.size Size of one sample round.*
#' @param C.boost Set to 1 to use faster iteration with C, set to 0 to use
#' slower R functions.
#' @return Returns samples as a list: \item{z}{ S.num x N matrix of samples of
#' component assignments for links. } \item{conv}{ Vector of length (B.num +
#' S.num) with convergence estimator values for link sampling. } \item{counts}{
#' (B.num + S.num) x C matrix of link component sizes. } additionally all
#' parameters of the run are included in the list.
#' @author Juuso Parkkinen
#' @seealso \code{\link{ICMg.combined.sampler}}
#' @references Parkkinen, J. and Kaski, S. Searching for functional gene
#' modules with interaction component models. BMC Systems Biology 4 (2010), 4.
#' @keywords methods
#' @export
#' @examples
#' data(osmo) # Load data
#' 
#' ## Run ICMg links sampler	
#' res <- ICMg.links.sampler(osmo$ppi, C=10) 
#' 
ICMg.links.sampler <- function(L, C, alpha=10, beta=0.01,  B.num=8, B.size=100, S.num=20, S.size=10, C.boost=1) {
# Sample posterior assignments of data points to the latent components, based on network data only

  message("Sampling ICM...\n")
  ## L is Nx2 matrix of link endpoints
  ## C is the number of latent component
  ## alpha and beta are the hyperparameters
  
  ## B.size is the size of one burnin round, after which convergence and component counts are reported
  ## B.num is the number of these burnin rounds
  ## S size is likewise the size of one sample round, after which samples are taken
  ## S.num is the number of these sample rounds
  
  Nlinks <- dim(L)[1]
  Nnodes <- max(L)

  message("nodes:",Nnodes,"links:",Nlinks,"components:",C,"alpha:",alpha,"beta:",beta,"\n")
  
  out <- list()
  out$z <- vector("integer", Nlinks)     # link l is generated by z[l]
  out$n <- vector("integer", C) # n[z] links generated by component z
  out$q <- matrix(0, C, Nnodes) # k[z,i] links generated by z end at i

  ## Start from random initialization  
  out$z <- sample(1:C, Nlinks, replace=TRUE)
  out$n <- as.vector(table(factor(out$z, levels=1:C)))
  
  for (l in 1:Nlinks) {
    out$q[out$z[l], L[l,1]] <- out$q[out$z[l], L[l,1]]+1
    out$q[out$z[l], L[l,2]] <- out$q[out$z[l], L[l,2]]+1
  }

  ## Initialize random number generator
  ICMg.randominit()

  ## Initialize some necessary data structures
  samples <- list(z=matrix(0, S.num, Nlinks), conv=vector("numeric", B.num+S.num),
                  counts=matrix(0,B.num+S.num,C))
  conv <- vector("numeric", Nlinks)
  
  message("Sampling", B.num*B.size + S.num*S.size,"iterations\n")
  
  ## Do burnin and monitor convergence
  message("Burnin iterations:",B.num*B.size,"\n")
  message("I: 0 n(z):",out$n,"\n")
  for (b in 1:B.num) {
    
    Lindices <- sample(Nlinks)
    out <- ICMg.links.wrapper(L, B.size, Nlinks, Nnodes, Lindices, C,
                       out$z, out$q, out$n, alpha, beta, conv, C.boost)
    samples$conv[b] <- mean(log(out$conv))
    samples$counts[b,] <- out$n
    message("I:",b*B.size,"conv:",samples$conv[b],"n(z):",samples$counts[b,],"\n")
  }

  ## After burnin take samples
  message("Sample iterations:",S.num*S.size,"\n")
  for (s in 1:S.num) {

    Lindices <- sample(Nlinks)
    out <- ICMg.links.wrapper(L, S.size, Nlinks, Nnodes, Lindices, C,
                       out$z, out$q, out$n, alpha, beta, conv, C.boost)
    samples$conv[B.num +s] <- mean(log(out$conv))
    samples$counts[B.num+s,] <- out$n
    message("I:",B.num*B.size + s*S.size,"conv:",samples$conv[B.num +s],
        "n(z):",samples$counts[B.num+s,],"\n")
    samples$z[s,] <- out$z
  }

  ## Save parameters
  samples$B.num <- B.num
  samples$B.size <- B.size
  samples$S.num <- S.num
  samples$S.size <- S.size
  samples$alpha <- alpha
  samples$beta <- beta

  message("Sampling finished\n")
  return(samples)
}



#' @title ICMg.get.comp.memberships
#' 
#' @description Function for computing the component memberships for
#' each data point from the MCMC samples.
#' 
#' @usage ICMg.get.comp.memberships(links, samples)
#' @param links N x 2 matrix of link endpoints (N = number of links).
#' @param samples Posterior samples, as given by either ICMg.combined.sampler
#' or ICMg.links.sampler.
#' @return A matrix containing the component memberships for each data point
#' (node).
#' @author Juuso Parkkinen
#' @seealso \code{\link{ICMg.combined.sampler}},
#' \code{\link{ICMg.links.sampler}}
#' @references Parkkinen, J. and Kaski, S. Searching for functional gene
#' modules with interaction component models. BMC Systems Biology 4 (2010), 4.
#' @export
#' @keywords methods
ICMg.get.comp.memberships <- function(links, samples) {

  Nnodes <- max(links)
  Nlinks <- dim(links)[1]
  Ncomps <- max(samples$z)
  Nsamples <- dim(samples$z)[1]

  ## Node proportion matrix (average over counts)
  proportions <- matrix(0, Ncomps, Nnodes)

  for (s in 1:Nsamples) {
 
    counts <- matrix(0, Ncomps, Nnodes)
    ## Counts from link components
    for (n in 1:Nlinks) {

      a <- links[n,1]
      b <- links[n,2]
      c <- samples$z[s,n]
      counts[c,a] <- counts[c,a]+1
      counts[c,b] <- counts[c,b]+1
    }

    ## Counts from node components (if given)
    if (length(samples$w)>0) {
      for (m in 1:Nnodes) {
        c <- samples$w[s,m]
        counts[c,m] <- counts[c,m]+1
      }
    }
    
    ## Compute component-wise average from the samples
    proportions <- proportions + t(t(counts)/apply(counts, 2, sum))
  }
  return(proportions/Nsamples)
}