# R/mppca_core_functions.R
#
# Defines functions MPPCA_one_q_one_g get_mu_g get_w_g initial_wg exp_z_g compute_tau e_uig e_uig_i_g E_uig_uig_i_g E_uig_uig_i E_uig_uig mu_g_est pi_est w_g_hat sigma_aux sigma2_hat_mppca observed_log_lik_mppca Aitken_criterion loadings_std_mppca

#' @importFrom stats cov rnorm cmdscale dist
#' @importFrom mclust mclustBIC summary.mclustBIC summaryMclustBIC
#' @importFrom stats prcomp qnorm sd glm gaussian
#' @importFrom magrittr %>%
#' @importFrom future plan multisession
#' @importFrom future.apply future_lapply
#' @importFrom furrr future_map
#' @importFrom mvtnorm dmvnorm

# MPPCA Method -------------------------------------------------------------

# Fit a mixture of probabilistic PCA models (MPPCA) for a single choice of
# latent dimension q and number of groups g, using an AECM-style algorithm
# with two E/M cycles per iteration.
#
# Arguments:
#   data            n x p observation matrix/data frame.
#   q               number of latent dimensions (principal components) per group.
#   g               number of mixture components; if g == 1 the PPCA_one_q
#                   routine (defined elsewhere in the package) is used instead.
#   max_it          maximum number of EM iterations.
#   eps             tolerance for the Aitken acceleration convergence check.
#   initial.guesses optional list(w_g, mu_g, pi, Sig) of starting values; when
#                   NULL, starting values come from mclust + per-group PPCA.
#
# Returns a list with: sigma2, score (class "MPPCA_score"), clustering,
# loadings (class "MPPCA_loadings"), pi, mu and bic.
MPPCA_one_q_one_g <- function(data, q, g, max_it = 1000,eps = 0.1, initial.guesses = NULL){

  #Checking if g = 1: run plain PPCA instead of a mixture
  if (g ==1){
    PPCA_one_q(data, q=q, eps = eps)
  } else if (g>1) {

    n <- nrow(data)
    p <- ncol(data)

    data_scaled <- scale(data, scale = FALSE) #Mean-centering the data (no rescaling)
    # Number of free parameters, later used for the BIC
    df <- (p*q) - (0.5 *q*(q - 1)) + 1 + (g-1)

    ## Initialize

    ll_vector = c()   # observed log-likelihood trace, one entry per iteration
    iter <- 1
    conv <- FALSE

    ##  Initial guesses

    if(is.null(initial.guesses) == FALSE){

      # User-supplied starting values: loadings, means, mixing proportions, sigma2
      w_g = initial.guesses$w_g
      mu_g = initial.guesses$mu_g
      pi = initial.guesses$pi
      Sig = initial.guesses$Sig

    }
    if (is.null(initial.guesses)==TRUE) {

      #Estimate groups using package mclust on a classical MDS projection
      m<-mclust::mclustBIC(cmdscale(dist(data_scaled)), G=g, verbose = FALSE)
      res<-summary.mclustBIC(m, cmdscale(dist(data_scaled)))
      tau<-res$z                                             #Probability of each obs belonging to each group
      pi<-res$parameters$pro                                 #Initial guess for membership probabilities

      #Make the groups with the estimated tau (hard assignment to most likely group):
      colnames(tau) = seq(1,g,1)
      data_scaled = as.data.frame(data_scaled)
      data_scaled$groups_init = as.numeric(colnames(tau)[max.col(tau,ties.method="first")])
      data= as.data.frame(data)
      data$groups_init = as.numeric(colnames(tau)[max.col(tau,ties.method="first")])

      #Get column means for each of the groups
      mu_g  = sapply(X= seq(1,g,1), FUN = get_mu_g, data= data_scaled)

      #Perform a PPCA in each initial group and get initial W using PPCA

      # NOTE(review): this inner g == 1 branch is unreachable (we are inside
      # the g > 1 branch above) and `out` is undefined here -- looks like it
      # was meant to be `w_g[[2]] = ppca$sigma2`; confirm before relying on it.
      if(g ==1){
        ppca =  PPCA_one_q(data = data, q)
        w_g = list(); w_g[[1]] = ppca$loadings; out[[2]] = ppca$sigma2;
      } else if (g > 1){

        ori_plan <- plan()
        #plan(multisession)

        x= seq(1,g,1)
        # Per-group PPCA fits via future_lapply (sequential under the current
        # plan unless the caller has set a parallel plan)
        w_g <- try(future_lapply(x, function(x) {
          get_w_g(data, g = x,q)
        }), silent=TRUE)


        #Perform a different option of initial guesses if PPCA fails
        # NOTE(review): inherits(w_g, "try-error") would be the safer idiom here
        if(class(w_g) == "try-error"){
          # Fallback: eigen-decomposition of each group's covariance matrix
          w_g2 = sapply(seq(1,g,1), initial_wg, q, data_scaled, tau, simplify = FALSE)
          w_g <- w_g2
        }

      }

      #Sum all second elements of the list:
      #initial sigma2 = average of the per-group sigma2 estimates
      Sig = sum(unlist(lapply(w_g, "[[", 2)))/g

      #Exclude elements sigma from list, keeping only the loadings matrices
      w_g = lapply(w_g, function(list) { list = list[[1]]})

      # Drop the temporary hard-assignment column again
      data_scaled$groups_init = NULL; data$groups_init =NULL

    }

    #           CYCLE 1

    while(conv==FALSE  & iter <= max_it){

      #     Estep 1: responsibilities given current parameters

      tau = compute_tau(data_scaled, mu_g, pi, Sig, w_g)

      #     Mstep 1

      #update MU (tau-weighted group means)
      mu_g_new =sapply(seq(1,g,1),mu_g_est,data_scaled, tau)

      #update PI (probabilities of membership)
      pi_new =sapply(seq(1,g,1), pi_est,data_scaled,tau)

      ## CYCLE 2

      ## Estep 2

      #1. Recompute tau with new mu and pi
      tau = compute_tau(data_scaled,mu_g_new, pi_new, Sig, w_g)

      #2. Compute E(uig): posterior expected latent scores per observation
      e_uig_new = e_uig(w_g, mu_g_new, Sig, data_scaled)

      #Compute E[uit uitT]
      # NOTE(review): E_uu is computed but never used afterwards
      E_uu = E_uig_uig(w_g, Sig, e_uig_new,n)

      ##    Mstep 2

      #Calculate new W (loadings update per group)
      w_g_new = sapply(seq(1,g,1), w_g_hat, data_scaled, Sig, tau, w_g, mu_g_new)

      #Calculate sigma2 (single value shared across groups)
      Sig_new = sigma2_hat_mppca(data_scaled,mu_g = mu_g_new, w_g_new, w_g, Sig,pi = pi_new,tau)

      ##   Assess convergence
      ll = observed_log_lik_mppca(data_scaled, mu_g_new, w_g_new, Sig_new, pi_new, tau)
      ll_vector = c(ll_vector, ll)

      #Perform aitken calculation (needs at least 4 log-likelihood values)
      conv = ifelse(length(ll_vector) >= 4, Aitken_criterion(ll_vector, iter) <= eps, FALSE)

      mu_g = mu_g_new;
      w_g = w_g_new;
      Sig = Sig_new;
      pi = pi_new;

      iter <- iter + 1
    }

    # bic_value() is defined elsewhere in the package
    bic <- bic_value(ll, df, n)  #Check if this is supposed to be the observed loglik

    # Rename loadings data structure
    names(w_g_new) <- c(paste0("Group", 1:g)) # Rename list names

    # Hard clustering: most responsible group per observation
    clusters <- data.frame(matrix(max.col(tau), nrow=n))
    colnames(clusters) <- "Group"

    # Reorganize score data structure (full score): one q x n matrix per group
    score.new <- c()
    for(group_n in 1:g){
      temp <- c()
      for(obs_n in 1:n) {
        temp <- cbind(temp, e_uig_new[[obs_n]][,group_n])
      }
      rownames(temp) <- c(paste0("PC", 1:q)) # Rename row names
      colnames(temp) <- rownames(data) # Rename col names
      score.new[[group_n]] <- temp
    }
    names(score.new) <- c(paste0("Group", 1:g)) # Rename list names

    # Group-specific scores: keep only observations assigned to each group
    score_specific <- c()
    for(cluster_n in 1:g) {
      score_specific[[cluster_n]] <- score.new[[cluster_n]][,unlist(clusters, use.names = FALSE)==cluster_n]

      if(q==1) { # Properly name when q = 1 only
        score_specific[[cluster_n]] <- matrix(score_specific[[cluster_n]], ncol = length(score_specific[[cluster_n]]), dimnames = list("PC1")) # Rename row names
      }
    }
    names(score_specific) <- c(paste0("Group", 1:g)) # Rename list names

    # Posterior score variance: Sig * (W'W + Sig I)^{-1} per group
    if(q==1){
      score_var <- list()
    }
    else{
      score_var <- c()
    }
    for(group_n in 1:g) {
      score_var[[group_n]] <- Sig_new * solve((t(w_g_new[[group_n]]) %*% w_g_new[[group_n]]) +
                                                   (Sig_new * diag(q)))
    }

    names(score_var) <- c(paste0("Group", 1:g)) # Rename row names

    score <- list(score = score_specific, score_var = score_var)

    class(score) <- c("MPPCA_score")
    class(w_g_new) <- c("MPPCA_loadings")

    output <- list(sigma2 =Sig_new,
                   score = score,
                   clustering = clusters,
                   loadings = w_g_new,
                   pi = pi_new,
                   mu = mu_g_new,
                   bic = bic)
    return(output)
  }

}

# Initial guesses  --------------------------------------------------------

# Column means of the observations hard-assigned to group g.
# `data` must carry a `groups_init` column; returns a length-1 list whose
# single element is the named vector of the p variable means.
get_mu_g = function(data, g){
  p = ncol(data) - 1
  rows_g = data[data$groups_init == g, , drop = FALSE]
  rows_g$groups_init = NULL
  means_g = colMeans(rows_g)
  list(means_g[1:p])
}


# Initial loadings and sigma2 for group g, obtained by fitting a PPCA
# (package routine PPCA_one_q) on the rows hard-assigned to that group.
# Returns list(loadings_matrix, sigma2).
get_w_g = function(data, g, q){
  group_rows = data[data$groups_init == g, , drop = FALSE]
  group_rows$groups_init = NULL
  fit = PPCA_one_q(data = group_rows, q = q)
  loadings_g = as.matrix(fit$loadings, ncol = q)
  list(loadings_g, fit$sigma2)
}

# Fallback initial guesses for group g when the PPCA-based initialization
# fails: first q eigenvectors of the group covariance matrix as loadings,
# and the mean of the trailing eigenvalues as sigma2 (with a small random
# positive value substituted when that mean is essentially zero).
# NOTE: the tau argument is accepted for call-compatibility but not used.
initial_wg = function(g, q, data_scaled, tau){
  grp = data_scaled[data_scaled$groups_init == g, , drop = FALSE]
  grp$groups_init = NULL
  p = ncol(grp)
  eig = eigen(cov(grp))
  w_g = eig$vectors[, 1:q]
  sig_g = abs(mean(eig$values[(q + 1):p]))
  if (sig_g < 0.0001) {
    sig_g = abs(rnorm(1, 0.01, 0.001))
  }
  list(w_g, sig_g)
}
# E-step  --------------------------------------------------------

exp_z_g = function(g,data_scaled,mu_g,pi,Sig, w_g){
  # Unnormalized E-step responsibility numerator for group g:
  # pi_g * N(x | mu_g, W_g W_g' + sigma2 I), for every observation.
  # g: group index; returns an n x 1 matrix.

  # Number of observed variables; the loadings may be a plain vector when q = 1.
  p = ifelse( is.null(dim(w_g[[g]]))==TRUE, length(w_g[[1]]),  dim(w_g[[g]])[1])

  # Work on the log scale first for numerical stability, then exponentiate.
  res = dmvnorm(data_scaled,mu_g[[g]], w_g[[g]]%*%t(w_g[[g]])+ Sig*diag(p),log=TRUE) + log(pi[g])
  res = exp(res)
  # Guard against overflow to +Inf. (The former `res[res == -Inf] <- -1`
  # branch was dead code -- exp() never returns -Inf -- and was removed,
  # along with an unused local q.)
  res[res == Inf] <- 1
  res = matrix(res, ncol=1); return(res)
}


compute_tau = function(data_scaled, mu_g, pi, Sig, w_g){
  # E-step responsibilities: apply exp_z_g to every group and normalize each
  # row (observation) to sum to one. Returns an n x g matrix with column
  # names 1..g. (Unused locals p and q were removed; apply(z, 1, sum) was
  # replaced by the equivalent, idiomatic rowSums.)
  g = length(w_g)

  # Unnormalized posterior numerators, one column per group
  z = sapply(X = seq_len(g), exp_z_g, data_scaled, mu_g, pi, Sig, w_g)

  tau <- z / rowSums(z)
  colnames(tau) = seq_len(g)
  tau[is.nan(tau)] <- 1 #corrects numerical issues (0/0 rows)
  return(tau)
}

# Posterior expected latent scores E[u_ig] for every observation: returns a
# list of length n, each element a q x g matrix (one column per group).
e_uig = function(w_g, mu_g, Sig, data_scaled){

  # Scores for a single observation across all groups
  per_obs = function(index, w_g, mu_g, Sig, data_scaled){
    q = ifelse(is.null(dim(w_g[[1]])), 1, dim(w_g[[1]])[2])
    scores = sapply(seq_len(length(w_g)), e_uig_i_g, index, w_g, mu_g, Sig, data_scaled)
    list(matrix(scores, nrow = q))
  }

  n = nrow(data_scaled)
  sapply(seq_len(n), per_obs, w_g, mu_g, Sig, data_scaled)
}

# Posterior mean of the latent score for observation `index` in group g:
# M^{-1} W' (x - mu_g), with M = W'W + sigma2 I. Returns a q x 1 matrix.
e_uig_i_g = function(g, index, w_g, mu_g, Sig, data_scaled){
  W = w_g[[g]]
  q = ifelse(is.null(dim(W)), 1, dim(W)[2])

  X = as.matrix(data_scaled)
  centered = matrix(X[index, ] - mu_g[[g]], nrow = 1)

  M = t(W) %*% W + Sig * diag(q)
  score = t(solve(M) %*% t(W) %*% t(centered))
  matrix(score, nrow = q)
}


# Posterior second moment for observation `index` in group g:
# Sig * M^{-1} + E[u] E[u]', with M = W'W + sigma2 I.
# Returns the result wrapped in a length-1 list.
E_uig_uig_i_g = function(index, g, w_g, Sig, e_uig){
  W = w_g[[g]]
  q = ifelse(is.null(dim(W)), 1, dim(W)[2])

  M = t(W) %*% W + Sig * diag(q)
  second_moment = Sig * solve(M) + e_uig[[index]] %*% t(e_uig[[index]])
  list(second_moment)
}

# Second moments of observation `index` for all groups, wrapped in a
# length-1 list (inner element is the per-group list from sapply).
E_uig_uig_i = function(index, w_g, Sig, e_uig){
  n_groups = as.numeric(length(w_g))
  per_group = sapply(seq_len(n_groups), E_uig_uig_i_g, index = index, w_g, Sig, e_uig)
  list(per_group)
}

# Posterior second moments E[u u'] for every observation (list of length n).
E_uig_uig = function(w_g, Sig, e_uig,n){
  sapply(seq_len(n), E_uig_uig_i, w_g, Sig, e_uig)
}

# M-step  --------------------------------------------------------

# M-step mean update for group g: responsibility-weighted column means.
# Returns a length-1 list holding the named mean vector.
mu_g_est = function(g, data_scaled, tau){
  weights = tau[, g]
  weighted_mean = colSums(data_scaled * weights) / sum(weights)
  list(weighted_mean)
}

# M-step mixing-proportion update for group g: mean responsibility.
pi_est = function(g, data_scaled, tau){
  sum(tau[, g]) / nrow(data_scaled)
}

# M-step loadings update for group g:
#   W_new = S_g W (sigma2 I + M^{-1} W' S_g W)^{-1},
# where M = W'W + sigma2 I and S_g is the responsibility-weighted scatter
# matrix of the centered data. Returns the new loadings in a length-1 list.
w_g_hat = function(g, data_scaled, Sig, tau, w_g, mu_g){
  W = w_g[[g]]
  q = ifelse(is.null(dim(W)), 1, dim(W)[2])

  weights = tau[, g]
  centered = as.matrix(sweep(data_scaled, 2, mu_g[[g]], "-"))

  # Weighted scatter (weights applied on both sides, as in the original)
  wd = centered * weights
  S_g = (t(wd) %*% wd) / sum(weights)

  M = (t(W) %*% W) + Sig * diag(q)
  denom = Sig * diag(q) + solve(M) %*% t(W) %*% S_g %*% W

  list(S_g %*% W %*% solve(denom))
}

sigma_aux = function(g, data_scaled, mu_g, w_g_new, w_g, Sig, pi, tau){
  # Per-group contribution to the sigma2 update:
  #   pi_g * ( tr(S_g) - tr(S_g W_new M^{-1} W_old') ),
  # with M = W_new' W_new + sigma2 I and S_g the responsibility-weighted
  # scatter matrix of the centered data.
  # (Fix: the original called tr(), which is not a base R function and is
  # not imported in this file; the trace is now computed as sum(diag(.)).)
  q = ifelse( is.null(dim(w_g[[g]]))==TRUE, 1,  dim(w_g[[g]])[2])

  data_g<-sweep(data_scaled,2,mu_g[[g]],"-")
  data_g = as.matrix(data_g)

  M<-(t(w_g_new[[g]])%*%w_g_new[[g]]) + (Sig)*diag(q)
  S_g = (t(data_g*tau[,g])%*%(data_g*tau[,g]))/sum(tau[,g])

  aux = S_g%*%w_g_new[[g]]%*%solve(M)%*%t(w_g[[g]])

  res = pi[g]*(sum(diag(S_g)) - sum(diag(aux)))
  return(res)

}

# New shared sigma2: sum of the per-group trace terms from sigma_aux,
# divided by the number of observed variables p.
sigma2_hat_mppca= function(data_scaled, mu_g, w_g_new, w_g, Sig, pi, tau){
  p = dim(w_g[[1]])[1]
  n_groups = as.numeric(length(w_g))
  per_group = sapply(seq_len(n_groups), sigma_aux, data_scaled, mu_g, w_g_new, w_g, Sig, pi, tau)
  sum(per_group) / p
}

# Results Calculation -----------------------------------------------------

observed_log_lik_mppca = function(data_scaled, mu_g, w_g, Sig, pi, tau){
  # Responsibility-weighted observed-data log-likelihood:
  #   sum_i sum_k tau_ik * ( log pi_k + log N(x_i | mu_k, W_k W_k' + sigma2 I) ).
  # (Fix: the inner function used to overwrite its group-index argument with
  # `g = as.numeric(length(w_g))`, so every column computed only the LAST
  # group's contribution; the index is now used as passed in.)
  g = ncol(tau)

  # One column of contributions (n x 1) for group k
  single_group = function(k, data_scaled, mu_g, w_g, Sig, pi, tau){
    p = ifelse( is.null(dim(w_g[[k]]))==TRUE, length(w_g[[k]]),  dim(w_g[[k]])[1])
    store_k<-tau[,k]*log(pi[k]) + tau[,k]*dmvnorm(data_scaled, mu_g[[k]], w_g[[k]]%*%t(w_g[[k]])+Sig*diag(p),log = TRUE)
    store_k = matrix(store_k, ncol = 1)
    return(store_k)
  }

  all_g = sapply(seq_len(g), single_group, data_scaled, mu_g, w_g, Sig, pi, tau)
  obs_ll = sum(rowSums(all_g))
  return(obs_ll)

}

# Aitken acceleration convergence criterion: estimate the asymptotic
# log-likelihood at two successive iterations and return the absolute
# difference of the two estimates. Requires at least 4 entries in ll_vector
# (indices iter, iter-1, iter-2, iter-3 are read).
Aitken_criterion = function(ll_vector, iter){

  l_now = ll_vector[iter]
  l_prev1 = ll_vector[iter - 1]
  l_prev2 = ll_vector[iter - 2]
  l_prev3 = ll_vector[iter - 3]

  # Aitken acceleration ratios at the current and previous iteration
  a_now = (l_now - l_prev1)/(l_prev1 - l_prev2)
  a_prev = (l_prev1 - l_prev2)/(l_prev2 - l_prev3)

  # Asymptotic log-likelihood estimates
  ll_inf_prev = l_prev2 + (1 - a_prev)^(-1)*(l_prev1 - l_prev2)
  ll_inf_now = l_prev1 + (1 - a_now)^(-1)*(l_now - l_prev1)

  abs(ll_inf_now - ll_inf_prev)
}

# Bootstrap standard errors of the MPPCA loadings: refit the model on B
# bootstrap resamples of `data` (via future_map, parallel if the caller
# has set a parallel plan) and return, per group, the element-wise standard
# deviation of the B loadings matrices.
loadings_std_mppca = function(data, q, g, initial.guesses, B){

  # Fit the model on one bootstrap resample and keep only the loadings
  one_replica = function(data, q, g, initial.guesses){
    n = nrow(data)
    boot_rows = sample(nrow(data), size = n, replace = TRUE)
    fit = MPPCA_one_q_one_g(data[boot_rows, ], q, g, eps = 0.1, initial.guesses = initial.guesses)
    fit$loadings
  }

  ori_plan <- plan()
  #plan(multisession)

  list_loadings = future_map(1:B, ~ one_replica(data, q, g, initial.guesses))

  # Element-wise SD of the bootstrap loadings for one group
  sd_by_group = function(group, list_loadings){
    loads = lapply(list_loadings, "[[", group)
    list(sd = apply(simplify2array(loads), c(1, 2), sd))
  }

  sapply(seq(1, g, 1), sd_by_group, list_loadings)
}
# GweeXianYao/metaboliteR documentation built on Jan. 21, 2020, 7:18 a.m.