R/beta_k.R

Defines functions beta_k

Documented in beta_k

globalVariables(c("k"))
#' @title Fit the K.. model
#' @description Fit the K.. model from the \code{\link[betaclust:betaclust]{betaclust}} family of beta mixture models for DNA methylation data.
#'              The K.. model analyses a single DNA sample type and identifies the thresholds between the different methylation states.
#'
#' @export
#'
#' @details The K.. model clusters each of the \eqn{C} CpG sites into one of \eqn{K} methylation states, based on data from \eqn{N} patients for one DNA sample type (i.e. \eqn{R = 1}).
#' As each CpG site can belong to any of the \eqn{M = 3} methylation states (hypomethylated, hemimethylated or hypermethylated), the default value of \eqn{K = M = 3}.
#' Under the K.. model the shape parameters of each cluster are constrained to be equal for each patient. The returned object from this function can be passed as an input parameter to the
#' \code{\link[betaclust:threshold]{threshold}} function available in this package to calculate the thresholds between the methylation states.
#'
#' @seealso \code{\link{beta_kn}}
#' @seealso \code{\link{betaclust}}
#' @seealso \code{\link{threshold}}
#'
#' @param data A dataframe of dimension \eqn{C \times N} containing methylation values for \eqn{C} CpG sites from \eqn{R = 1} sample type collected from \eqn{N} patients.
#' Samples are grouped together in the dataframe such that the columns are ordered as Sample1_Patient1, Sample1_Patient2, etc.
#' @param M Number of methylation states to be identified in a DNA sample type.
#' @param parallel_process The "TRUE" option results in parallel processing of the models for increased computational efficiency. The default option has been set as "FALSE" due to package testing limitations.
#' @param seed Seed to allow for reproducibility (default = NULL).
#'
#'
#' @return A list containing:
#' \itemize{
#'    \item cluster_size - The total number of CpG sites in each of the K clusters.
#'    \item llk - A vector containing the log-likelihood value at each step of the EM algorithm.
#'    \item alpha - The first shape parameter for the beta mixture model.
#'    \item delta - The second shape parameter for the beta mixture model.
#'    \item tau - The estimated mixing proportion for each cluster.
#'    \item z - A matrix of dimension \eqn{C \times K} containing the posterior probability of each CpG site belonging to each of the \eqn{K} clusters.
#'    \item classification - The classification corresponding to z, i.e. map(z).
#'    \item uncertainty - The uncertainty of each CpG site's clustering.    }
#'
#' @examples
#' my.seed <- 190
#' M <- 3
#' data_output <- beta_k(pca.methylation.data[1:30,2:5], M,
#'                       parallel_process = FALSE, seed = my.seed)
#' thresholds <- threshold(data_output, pca.methylation.data[1:30,2:5], "K..")
#' @importFrom foreach %dopar%
#' @importFrom stats C
#' @importFrom utils txtProgressBar


beta_k<-function(data,M=3,parallel_process=FALSE,seed = NULL){


  ######### K.. Model ##########
  X=data


  ## select the # of cores on which the parallel code is to run
  register=NULL
  if(is.null(register)){

    if (parallel_process==FALSE) {
      # use 2 cores in CRAN/Travis/AppVeyor
      ncores <- 2L
    } else {
      # use all cores in devtools::test()
      ncores <- parallel::detectCores()
    }
    #ncores = parallel::detectCores()
    my.cluster <- parallel::makeCluster(ncores-1)
    doParallel::registerDoParallel(cl = my.cluster)}

  ## declare aliases for dopar command
  `%dopar%` <- foreach::`%dopar%`
  `%do%` <- foreach::`%do%`

  X<-as.data.frame(X)
  if(any(is.na(X)))
  {
    stop("Missing values observed in dataset")
  }

  flag_uni=TRUE

  while(flag_uni){
      ## set the seed for reproducible work
      if (!is.null(seed))
        set.seed(seed)

      flag_uni=FALSE


    K=M
    ## Initial clustering using k-means
    k_cluster<-stats::kmeans(X,K)
    mem <- k_cluster$cluster
    data_clust<-cbind(X,mem)

    ## Get values for parameters C (no. of CpG sites), N (No. of patients)
    x=as.matrix(data_clust)
    mem=x[,ncol(x)]
    data_full=x
    x=x[,-ncol(x)]
    C=nrow(x)
    N=ncol(x)

    ##  starting values from initial clustering
    mu=vector("numeric",K)
    sigma=vector("numeric",K)
    mu_new=vector("numeric",K)
    sigma_sq=vector("numeric",K)
    alpha=vector("numeric",K)
    delta=vector("numeric",K)
    alpha_new=vector("numeric",K)
    delta_new=vector("numeric",K)
    term=vector("numeric",K)
    term_new=vector("numeric",K)
    tau=vector(length=K)
    tau_new=vector(length=K)


    ## function for calculating starting values for beta distribution parameters
    ## we consider the models unimodal in nature which restricts the parameters
    ## to be greater than 1.
    ini_theta_estimation = function(data,N,mem,C)
    {
      mu=vector()
      sigma=vector()
      term=vector()
      al=vector()
      de=vector()
      pi=vector()
      mu<-mean(data)
      sigma <- stats::sd(data)
      term=(mu*(1-mu)/(sigma^2))-1
      al=mu*term
      if(al<=1)
      {
        al=2
      }
      de=(1-mu)*term
      if(de<=1)
      {
        de=2
      }
      pi <- length(mem)/C
      return(list(al=al,de=de,tau=pi))
    }

    ## parallelization of starting value calculation
    ini_theta = foreach::foreach(k = 1:K, .combine=rbind) %dopar%
      {ini_theta_estimation(x[mem==k,],N,mem[mem==k],C)}

    ## unlisting the values returned from parallelized code
    mid=K+1
    len_theta=K*2
    tau_len=len_theta+1
    end=length(ini_theta)
    alpha=unlist(ini_theta[1:K])
    delta=unlist(ini_theta[mid:len_theta])
    tau=unlist(ini_theta[tau_len:end])


    ## Initialise complete data log-likelihood
    z <- matrix(NA,ncol = K,nrow=C)
    z_new <- matrix(NA,ncol = K,nrow=C)
    tol=1e-04
    l0=1e-07
    llk_iter<-vector(mode="numeric")
    llk_iter[1]=l0
    flag=TRUE

    while(flag)
    {
      ## E-step
      z_estimation = function(x,al,de,pi,N)
      {

        l3=1
        for(n in 1:N) ##no. of samples
        {

          l3=l3*stats::dbeta(x[,n],al,de)
        }

        return(pi*l3)
      }

      ## parallelization of E-step
      z = foreach::foreach(k = 1:K, .combine=cbind) %dopar%
        { z_estimation(x,alpha[k],delta[k],tau[k],N)}

      z <- sweep(z, 1, STATS = rowSums(z), FUN = "/")

      ## M-Step

      theta_estimation = function(x,z1,N)
      {
        y11=0
        y22=0
        al_new=0
        de_new=0
        y11=(sum(z1*rowSums(log(x))))/(N*sum(z1))
        y22=(sum(z1*rowSums(log(1-x))))/(N*sum(z1))
        term=0
        term=((exp(-y11)-1)*(exp(-y22)-1))-1
        al_new=0.5+(0.5*exp(-y22)/term)
        de_new= (0.5*exp(-y22)*(exp(-y11)-1))/term
        return(list(al_new=al_new,de_new=de_new))
      }


      ## parallelization of M-step
      theta = foreach::foreach(k = 1:K, .combine=rbind) %dopar%
        {theta_estimation(x,z[,k],N)}

      ## unlisting the values returned from parallelized code
      mid=K+1
      len_theta=K*2
      alpha_new=unlist(theta[1:K])
      delta_new=unlist(theta[mid:len_theta])


      ### Update z using new alpha and delta

      z_new = foreach::foreach(k = 1:K, .combine=cbind) %dopar%
        {z_estimation(x,alpha_new[k],delta_new[k],tau[k],N)}


      z.total=rowSums(z_new)
      z_new=z_new/z.total
      s=apply(z_new,2,sum)
      tau_new=s/C

      alpha=alpha_new
      delta=delta_new
      tau=tau_new

      if(any(alpha <1)){
        flag_uni=TRUE
      }else if(any(delta <1)){
        flag_uni=TRUE
      }else{
        flag_uni=FALSE
      }

      if(flag_uni==TRUE){
        if(!is.null(seed))
        {seed = seed+1}else seed=1
        break
      }

      l1=sum(log(z.total))
      llk_iter=c(llk_iter,l1)

      ##check convergence
      diff=abs(l1-l0)/abs(l1)
      flag<- diff>tol
      l0=l1

      }
  }


  ## Clustering
  mem_final<-matrix(NA,C,1)
  mem_final<-apply(z_new, 1, which.max)
  classification=mem_final
  cluster_count=table(mem_final)

  ### Uncertainty
  cert=apply(z_new,1,max)
  uc=1-cert

  parallel::stopCluster(cl=my.cluster)

  #### Return data
  return(list(cluster_size=cluster_count,llk=llk_iter,alpha=alpha,delta=delta,
              tau=tau,z=z_new,classification=classification,uncertainty=uc))

}

Try the betaclust package in your browser

Any scripts or data that you put into this service are public.

betaclust documentation built on Sept. 30, 2024, 9:30 a.m.