R/function_library.r
In MOP: Mutation Order Prediction

# loglik function calls C function "loglik" to compute the log-likelihood
loglik <- function(xx, AllPtrue, division, sel, Y, a, aa, base, theta)
{
    storage.mode(xx) <- "double"
    storage.mode(AllPtrue) <- "double"
    for (i in 1:length(division)) {
        storage.mode(division[[i]]) <- "integer"
    }
    storage.mode(sel) <- "integer"
    storage.mode(Y) <- "integer"
    for (i in 1:length(a)) {
        storage.mode(a[[i]]) <- "integer"
    }
    storage.mode(MAX) <- "integer"
    divsel <- division[[sel]]
    storage.mode(divsel) <- "integer"
    # base here should be a list, fix to use for loop
    for (i in 1:length(base)) {
      storage.mode(base[[i]]) <- "integer"
    }
    # storage.mode(theta) <- "double"
    for (i in 1:length(theta)) {
      storage.mode(theta[[i]]) <- "double"
    }

    .Call("loglik", xx, AllPtrue, divsel, Y, a, MAX, base, theta)
}

# gradient function calls C function "gradient" to compute the gradient 
# of log-likelihood with respect to the probability parameters.
gradient <- function(xx, AllPtrue, division, sel, Y, a, aa, base, theta)
{
    storage.mode(xx) <- "double"
    storage.mode(AllPtrue) <- "double"
    for (i in 1:length(division)) {
        storage.mode(division[[i]]) <- "integer"
    }
    storage.mode(sel) <- "integer"
    storage.mode(Y) <- "integer"
    for (i in 1:length(a)) {
        storage.mode(a[[i]]) <- "integer"
    }
    for (i in 1:length(aa)) {
        storage.mode(aa[[i]]) <- "integer"
    }
    storage.mode(MAX) <- "integer"
    divsel <- division[[sel]]
    storage.mode(divsel) <- "integer"
    # base here should be a list, fix to use for loop
    for (i in 1:length(base)) {
      storage.mode(base[[i]]) <- "integer"
    }
    # storage.mode(theta) <- "double"
    for (i in 1:length(theta)) {
      storage.mode(theta[[i]]) <- "double"
    }

    .Call("gradient", xx, AllPtrue, divsel, Y, a, aa, MAX, base, theta)
}

# a vector function specifying inequality constraints such that hin[j] > 0 
# for all j used in the constrained optimization function auglag
hin <- function(x, AllPtrue, division, sel, Y, a, aa, base, theta)     
{
    return(c(x, 1 - sum(x)))
}

# Jacobian of hin, used in the constrained optimization function auglag
hin.jac <- function(x, AllPtrue, division, sel, Y, a, aa, base, theta)   
{
    return(rbind(diag(length(x)), rep(-1, length(x))))
}

# theta.zero returns theta=0 which is essentially same the YounSimon 
# version model
theta.zero <- function(n) 
{
    return(0)
}

# theta.one returns theta=1
theta.one <- function(n) 
{
    return(1)
}

# theta.half returns theta=1/2
theta.half <- function(n) 
{
    return(1/2)
}

# theta.oneovern returns theta=1/n
theta.oneovern <- function(n) 
{
    return(1/n)
}

# theta.oneovernfac returns theta=1/n!
theta.oneovernfac <- function(n) 
{
    return(1/factorial(n))
}

# theta.user function computes the theta in the weights function
# where input thetafun should be user provided
theta.user <- function(thetafun) 
{   
    function(n) {
      thetafun(n)
    }
}

# rtheta function computes the theta in the weights function
rtheta <- function(n, theta.fun = theta.oneovern) 
{
    theta.fun(n)
}

# subloglik function calculates the log of likelihood 
subloglik <- function(P, N, a)
{
    temp <- rep(1, nrow(a[[N]]))
    for(j in 1:ncol(a[[N]])) {
        temp <- temp*P[a[[N]][, j], j]
    }
    
    return(log(sum(temp)))
}


########################################################################

# main.function implements the iterative optimization to obtain the 
# estimate of parameters
main.function <- function(Y, a, aa, thetafun = theta.zero)
{
    # process the mutation allele frequency data
    # Yold <- Y
    threshold <- 0.0
    samp.rm <- which(apply(Y, 1, function(x) sum(x > threshold)) == 0)
    if (length(samp.rm) > 0) {
      Y <- Y[-samp.rm, ]
    }
    base <- list()
    theta <- list()
    for (i in 1:nrow(Y)) {
      indi <- which(Y[i, ] > threshold)
      theta[[i]] <- thetafun(length(indi))
      base[[i]] <- rank(-Y[i, indi], ties.method = "random")
      Y[i, ] <- as.integer(Y[i, ] > threshold)
    }
    nomut <- max(rowSums(Y))
    MAX <- max(rowSums(Y))
    no.gene <- ncol(Y)

    nparam <- min(MAX + 1, nomut)
    division <- vector("list", nparam)
    for (i in 1:(nparam - 1)) division[[i]] <- i
    division[[nparam]] <- nparam:nomut

    AllPtrue <- matrix(0, nrow = ncol(Y), ncol = nomut)

    for (j in 1:nparam) {
        init <- runif(no.gene)
        init <- init / sum(init)
        AllPtrue[, division[[j]]] <- init
    }

    prev0 <- Inf
    prev <- Inf
    n <- 0
    decrease <- 1

    no.repeat <- 20
    decrease.limit <- 10^(-6)

    while (decrease > decrease.limit & n <= no.repeat) {

        n <- n + 1
        prev0 <- prev  

        # For each k, the length of \vec{P_{k}} optimized is N-1 where N 
        # is the number of driver genes. This is because \sum_{i=1}^{N} 
        # P_{k,i} = 1 and the value of P_{k,N} is determeined by the other 
        # N-1 P_{k,i}, thus we optimize only for P_{k,i} for i=1,..., N-1. 
        # We let P_{k,N}=1-\sum_{i=1}^{N-1}{P_{k,i}}
       
        # Find \vec{P_{k}} maximizing the log likelihood for each k in turn 
        # with the constraint that 0 < P_{k, i} for i = 1, ..., N-1 and 0 < 
        # 1 - \sum_{i=1}^{N-1}{P_{k,i}}.
        
        for(sel in 1:nparam) {

          # initial values for \vec{P_{k}}
          init <- runif(no.gene) 
          init <- init / sum(init)  

          # auglag is a constrained optimization function in R                       
          x <- auglag(par = init[-no.gene], fn = loglik, gr = gradient, hin = 
               hin, hin.jac = hin.jac, control.optim = list(maxit = 200), 
               control.outer = list(trace = F), AllPtrue = AllPtrue, division = 
               division, sel = sel, Y = Y, a = a, aa = aa, base = base, theta = 
               theta)
          # adjust the optimal parameter estimate
          tempAllPtrue <- AllPtrue
          nonnegative <- pmax(c(x$par, 1 - sum(x$par)), 0)
          # This is needed since sometimes the value x$par are negative.
          tempAllPtrue[, division[[sel]]] <- nonnegative / sum(nonnegative)   
          # negative log likelihood
          fval <- loglik(tempAllPtrue[-no.gene, 1], tempAllPtrue, division, 1, 
                  Y, a, aa, base, theta)     

          decrease <- prev - fval
          # if the negative log likelihood decreases, then change old P_{k} 
          # with new P_{k}
          if (decrease > 0) {
              prev <- fval
              AllPtrue <- tempAllPtrue
          }
        }
        decrease <- prev0 - prev
    }

    return(list(AllPtrue, prev))
}

# generate a list of permutation matrices by considering the probability 
# parameter contraints
generatea <- function(MAX, MAX.mut)
{
  a <- vector("list", MAX.mut)
  a[[1]] <- matrix(1, nrow = 1, ncol = 1)

  for (N in 2:MAX.mut){
    x <- matrix(1:N, nrow = N)
    for(i in 1:(min(N, MAX) - 1)) {
        y <- rep(1:N, nrow(x))[-as.vector(t(x + ((1:nrow(x)) - 1) * N))]
        x <- t(matrix(as.vector(t(matrix(rep(as.vector(x), N - ncol(x)),nrow = 
             nrow(x)))), nrow = ncol(x)))
        x <- cbind(x, y)
    }
    if (N > MAX) {
        y <- rep(1:N, nrow(x))[-as.vector(t(x + ((1:nrow(x)) - 1) * N))]
        y <- t(matrix(y, ncol = nrow(x)))
        x <- cbind(x, y)
    }
    a[[N]] <- x
  }
  
  return(a)
}

# function to estimate the probability matrix and mutation order of genes
# This version ignores 'MAX'
order_estimate <- function(sample.gene.mutation, N, parallel, MAX=NULL, thetafun = theta.zero)
{

    Y <- sample.gene.mutation
    threshold <- 0.0
    for (i in 1:nrow(Y)) {
      Y[i, ] <- as.integer(Y[i, ] > threshold)
    }
    # MAX <- 4
    MAX <- max(rowSums(Y))
    a <- generatea(MAX, max(rowSums(Y)))
    aa <- generatea(MAX - 1, max(rowSums(Y)))

    ## Run optimization with different inital values N times using parallel 
    ## computing if the variable "parallel" is TRUE :
    if (parallel) {
      tmp <- foreach (kk = 1:N) %dopar%  main.function(sample.gene.mutation, a, aa, thetafun)
    }
    else { ## Otherwise, run for loop
      tmp <- vector("list", N) 
      for(i in 1:N) {
        tmp[[i]] <- main.function(sample.gene.mutation, a, aa, thetafun)
      }
    }

    minusloglik <- rep(Inf, N)
    for(l in 1:N) {
        if(is.list(tmp[[l]])) minusloglik[l] <- tmp[[l]][[2]]
    }

    # Find the one giving the maximum likelihood
    result <- tmp[[which(minusloglik == min(minusloglik))[1]]][[1]] 

    return(result)

}

# Prob that there is any mutation in a gene
total <- function(X) {
    return(1 - apply(1 - X, 1, prod))
}

Intersection <- function(X,N)
{
    sel <- combinations(ncol(X), N)
    res <- 0
    for(i in 1:nrow(sel)) {
        res <- res + apply(X[, sel[i, ]], 1, prod)
    }
    return(res)
}
Union <- function(X)
{
    res <- 0
    for(i in 2:ncol(X)) {
        res <- res + Intersection(X, i) * (-1)^(i + 1)
    }
    return(apply(X, 1, sum) + res)
}

# Constructing BCa CI from bootstrapped data for each of the colon and lung data
ConfInterval <- function(x, mle, jack, sample.gene.mutation, lower, upper)
{
    no.gene <- ncol(sample.gene.mutation)

    # bias-correction constant
    z0 <- matrix(0, nrow = no.gene, ncol = ncol(mle))       

    for(i in 1:no.gene)
    {
        for(j in 1:ncol(mle))
        {
            z0[i,j] <- qnorm(mean(x[i, j, ] <= mle[i, j], na.rm = T))
        }
    }

    z0[z0 == Inf] <- 1000

    theta.mean <- apply(jack, 1:2, mean, na.rm = T)

    num <- denom <- 0

    for(i in 1:nrow(sample.gene.mutation))
    {
        if(sum(is.na(jack[, , i])) == 0)
        {
            num <- num + (theta.mean - jack[, , i])^3
            denom <- denom + (theta.mean - jack[, , i])^2
        }
    }
    accel <- num/6/denom^1.5   # acceleration constant

    alpha1 <- pnorm(z0 + (z0 + qnorm(lower))/(1 - accel*(z0 + qnorm(lower))))
    alpha2 <- pnorm(z0 + (z0 + qnorm(upper))/(1 - accel*(z0 + qnorm(upper))))

    CI <- array(0, dim = c(no.gene, 2, ncol(mle)))

    for(i in 1:no.gene)
    {
        for(j in 1:ncol(mle))
        {
            CI[i, 1, j] <- quantile(x[i, j, ], alpha1[i, j], na.rm = T)
            CI[i, 2, j] <- quantile(x[i, j, ], alpha2[i, j], na.rm = T)
        }
    }
    CI <- round(CI, 2)
    a <- round(mle, 2)

    tab <- NULL
    for(i in 1:ncol(a))
    {
        tab <- cbind(tab, "&", a[, i], "&", "(" , CI[, 1, i], ",", CI[,2,i], ")")
    }
    # for generating tables in the paper
    tab <- cbind(colnames(sample.gene.mutation), tab, "\\")   

    return(tab)
}