R/disclapmix2.R
In disclapmix2: Mixtures of Discrete Laplace Distributions using Numerical Optimisation

Documented in disclapmix2

#' @title Discrete Laplace mixture inference using Numerical Optimisation
#'
#' @param x DataFrame. Columns should be one character vector for each locus
#' @param number_of_clusters	The number of clusters to fit the model for.
#' @param include_2_loci Should duplicated loci be included or excluded from the analysis? 
#' @param remove_non_standard_haplotypes Should observations that are not single integer alleles be removed?
#' @param use_stripped_data_for_initial_clustering Should non_standard data be removed for the initial clustering?
#' @param initial_y_method Which cluster method to use for finding initial central haplotypes, y: pam (recommended) or clara.
#' @param verbose Set to 1 (or higher) to print optimisation details. Default is 0.
#' @description An extension to the *disclapmix* method in the *disclapmix* package that supports duplicated loci and other non-standard haplotypes.
#' @returns List.
#' @examples
#' require(disclapmix)
#'
#' data(danes) 
#'
#' x <- as.matrix(danes[rep(seq_len(nrow(danes)), danes$n), -ncol(danes)])
#' x2 <- as.data.frame(sapply(danes[rep(seq_len(nrow(danes)), danes$n), -ncol(danes)], as.character))
#'
#'
#' dlm_fit <- disclapmix(x, clusters = 3L)
#' dlm2_fit <- disclapmix2(x2, number_of_clusters = 3)
#'
#' stopifnot(all.equal(dlm_fit$logL_marginal, dlm2_fit$log_lik))
#' @export
disclapmix2 <- function(x, number_of_clusters, include_2_loci = FALSE, remove_non_standard_haplotypes = TRUE, 
                        use_stripped_data_for_initial_clustering = FALSE, initial_y_method = "pam",
                        verbose=0L){
  
  check_input_db(x)
  if (!(isTRUE(initial_y_method %in% c("pam", "clara")))){
    stop("initial_y_method needs to be \"pam\" or \"clara\"")
  }
  
  if (verbose>=1) verbose_print("Determining loci with suitable data")
  x_summarised <- summarise_db(x)
  
  # clean the data such that 1-loci only have 1 integer or NA
  # and 2-loci only have 2 integers or NA
  number_of_1_loci <- length(x_summarised$one_loci)
  number_of_2_loci <- if (include_2_loci) length(x_summarised$two_loci) else 0

  one_loci <- x_summarised$one_loci
  two_loci <- if (include_2_loci) x_summarised$two_loci else character()
  loci <- c(one_loci, two_loci)
  number_of_loci <- length(loci)
  
  locuslabel <- function(i) if (i==1) "locus" else "loci"
  if (verbose>=1) verbose_print("Found", number_of_1_loci, paste0("suitable 1-", locuslabel(number_of_1_loci)),
                                              "and", number_of_2_loci,paste0("suitable 2-", locuslabel(number_of_2_loci)),
                                              "among", ncol(x), "columns")
  
  x_cleaned <- matrix(data = character(), nrow = nrow(x), ncol = number_of_loci)
  colnames(x_cleaned) <- c(one_loci, if(number_of_2_loci>0) two_loci else NULL)
  
  if (verbose>=1) verbose_print("Setting any haplotype that is not 1 or 2 integers at the approriate locus to NA")
  removed_non_standard_dfs <- list()
  
  # locus = "DYS458"
  for(locus in one_loci){
    ind_1 <- which(x_summarised$x_ind_12_other[,locus]=="1")
    not_ind_1 <- which(sapply(x_summarised$x_ind_12_other[,locus]!="1", isTRUE))
    
    x_cleaned[ind_1,locus] <- x[[locus]][ind_1]
    
    number_removed <- length(not_ind_1)
    removed_non_standard_dfs[[1+length(removed_non_standard_dfs)]] <- data.frame(row = not_ind_1, locus=rep(locus, number_removed), haplotype=x[[locus]][not_ind_1])
  }
  
  if(number_of_2_loci > 0){
    for(locus in two_loci){
      ind_2 <- which(x_summarised$x_ind_12_other[,locus]=="2")
      not_ind_2 <- which(sapply(x_summarised$x_ind_12_other[,locus]!="2", isTRUE))

      x_cleaned[ind_2,locus] <- x[[locus]][ind_2]
      
      number_removed <- length(not_ind_2)
      
      removed_non_standard_dfs[[1+length(removed_non_standard_dfs)]] <- data.frame(row = not_ind_2, locus=rep(locus, number_removed), haplotype=x[[locus]][not_ind_2])
    }
  }
  
  # list out each removed non-standard haplotype
  removed_non_standard_df <- do.call(rbind, removed_non_standard_dfs)
  
  # list out the non-standard haplotypes
  non_standard_df <- unique(removed_non_standard_df[,c("locus","haplotype")])
  non_standard_df$locus <- factor(non_standard_df$locus, levels=loci)
  rownames(non_standard_df) <- NULL
  
  # assign index
  non_standard_df$index <- -seq_len(nrow(non_standard_df))
  if (nrow(non_standard_df) > 0){
    removed_non_standard_df$index <- non_standard_df$index[match(
      paste0(removed_non_standard_df$locus, "___", removed_non_standard_df$haplotype),
      paste0(non_standard_df$locus, "___", non_standard_df$haplotype))]
  }
  
  if (verbose>=1){
    if (nrow(removed_non_standard_df>1)){
      verbose_print("Set", nrow(removed_non_standard_df), "non-standard haplotypes to NA:")
      print(removed_non_standard_df)
    }
    else{
      verbose_print("All haplotypes are integer valued")
    }
  }
  
  # create a stripped down version of the db with only the easy profiles
  x_stripped <- x_cleaned[rowSums(is.na(x_cleaned))==0,]
  x_stripped_int <- to_int_db(x_stripped, one_loci, two_loci)
  
  if (remove_non_standard_haplotypes){
    if (verbose>=1) verbose_print("Removing profiles with any haplotype that is not 1 or 2 integers at the approriate locus")

    x_int <- x_stripped_int
  }else{
    if (verbose>=1) verbose_print("Not removing profiles with any haplotype that is not 1 or 2 integers at the approriate locus")
    x_int <- to_int_db(x_cleaned, one_loci, two_loci)
    
    observations_columns <- c(one_loci, if(number_of_2_loci > 0) paste0(two_loci,"(1)") else character())
    
    if (verbose>=1) verbose_print(sum(is.na(x_int[,observations_columns])), "NA observations in x_int")
  }
  
  if (verbose>=1) verbose_print(nrow(x_int) , paste0("profiles used in analysis (started with ", nrow(x),")"))
  if (verbose>=1) verbose_print("Determining initial clustering using", initial_y_method)
  
  if (!use_stripped_data_for_initial_clustering){
    
    if (initial_y_method=="pam"){
      initial_clustering <- cluster::pam(x_int, k = number_of_clusters, metric = "manhattan",diss = FALSE, keep.diss=FALSE, keep.data=TRUE)
    }
    else{
      initial_clustering <- cluster::clara(x_int, k = number_of_clusters, metric = "manhattan",pamLike = TRUE,samples=100, correct.d = TRUE)
    }
    
    if (anyNA(initial_clustering$medoids)){
      use_stripped_data_for_initial_clustering <- TRUE
      if (verbose>=1) verbose_print("NAs in initial clustering, retrying with stripped data")    }
  }else{
    if (verbose>=1) verbose_print("Using stripped data for initial clustering")
  }
  
  if (use_stripped_data_for_initial_clustering){
    
    if (initial_y_method == "pam"){
      initial_clustering <- cluster::pam(x_stripped_int, k = number_of_clusters, 
                                         metric = "manhattan",diss = FALSE, keep.diss=FALSE, keep.data=TRUE)
    }
    else{
      initial_clustering <- cluster::clara(x_stripped_int, k = number_of_clusters,
                                           metric = "manhattan", pamLike = TRUE, samples=100, correct.d = TRUE)
      
    }
    
  }
  
  initial_pam_tau <- (tabulate(initial_clustering$clustering, nbins = number_of_clusters)/nrow(x_int))
  initial_pam_theta_tau <- if(number_of_clusters==1L) numeric() else log( initial_pam_tau[1:(number_of_clusters-1)])
  
  # now find the variances
  y <- y0 <- initial_clustering$medoids
  theta <- theta0 <- if(number_of_clusters>1) c(initial_pam_theta_tau, rep(-1,number_of_clusters), rep(0,number_of_loci-1)) else rep(-1,number_of_loci)
  theta <- theta0 <- if(number_of_clusters>1) c(initial_pam_theta_tau, rep(-0.5,number_of_clusters), rep(-0.5,number_of_loci-1)) else rep(-1,number_of_loci)
  
  if (verbose>=1) verbose_print("Starting optimisation")
  
  cluster_labels <- paste0("cluster", seq_len(number_of_clusters))
  
  y_iterations <- list()
  theta_iterations <- list()
  
  optim_trace <- if (verbose>0) verbose - 1 else 0
  
  repeat{
    rownames(y) <- cluster_labels
    y_iterations[[1 + length(y_iterations)]] <- y
    
    f <- function(theta) {
      negll <- neg_loglik_theta(theta, x_int, y, number_of_1_loci, number_of_2_loci)
      
      negll
    }
    
    opt <- stats::optim(par = theta, fn = f, method = "BFGS", control = list(maxit=500, reltol = 1e-9, trace=optim_trace))
    if (opt$convergence != 0) stop("BFGS failed to converge")

    theta_opt <- opt$par
    theta_iterations[[1+length(theta_iterations)]] <- theta_opt
    
    tau_opt <- get_tau(theta = theta_opt, number_of_loci = number_of_loci, number_of_clusters = number_of_clusters)
    p_opt <- get_P(theta = theta_opt, number_of_loci = number_of_loci, number_of_clusters = number_of_clusters)
    
    rownames(p_opt) <- cluster_labels
    colnames(p_opt) <- loci
    
    x_profile_pr_by_cluster <- compute_profile_prs(p_by_cluster_and_locus = p_opt, db = x_int, y = y, number_of_1_loci, number_of_2_loci)
    v_matrix <- compute_posterior_cluster_prs(profile_pr = x_profile_pr_by_cluster, tau = tau_opt)
    
    # see if we need to move centers
    y_new <- move_centers(x_int, y, v_matrix)
    
    theta <- theta_opt
    
    dist_new_y <- sum(abs(y - y_new))
    if (dist_new_y == 0) {
      if (verbose >= 1L) {
        verbose_print("Current central haplotypes are optimal")
      }
      break
    } else if (any(duplicated(y_new))) { # new case introduced in disclapmix version 1.6.3
      if (verbose >= 1L) {
        verbose_print("New central haplotypes had at least one duplicated haplotype, change rejected")
      }
      break
    } else {
      if (verbose >= 2L) {      
        verbose_print("Current central haplotypes are not otimal")
        print(y)
        verbose_print("New central haplotypes:")
        print(y_new)
        verbose_print("Differences:")
        print(y_new - y)
        verbose_print("Number of stepwise mutations between center configurations = ", dist_new_y)
        
      } else if (verbose >= 1) {
        verbose_print("Current central haplotypes are not optimal, moving and restarting optimisation")
      }
      
      y <- y_new
    }
  }
  
  if((!remove_non_standard_haplotypes) & (nrow(removed_non_standard_df)>0)){
    
    repeat{
      verbose_print(nrow(removed_non_standard_df), "observations of",
                                  nrow(non_standard_df),
                                  "non-standard haplotypes were ignored, second stage estimation begins")
      
      # relabel the matrix such that non-standard alleles get negative indices
      x_int_ns <- x_int
      for(i in seq_len(nrow(removed_non_standard_df))){
        locus <- removed_non_standard_df$locus[i]
        column <- if (locus %in% one_loci) locus else paste0(locus, "(1)")
        x_int_ns[removed_non_standard_df$row[i], column] <- removed_non_standard_df$index[i]
      }
      if (verbose>=2){
        verbose_print("Relabeled data has", sum(x_int_ns<0, na.rm = TRUE), "negative indices")
      }
      ns_locus_haplotype <- paste0(non_standard_df$locus,"_", non_standard_df$haplotype)
      dimnames_q <- list(ns_locus_haplotype, cluster_labels)
      
      # keep trying to estimate pi,q and p until convergence
      pi_opt <- estimate_pr_ns(x = x_int_ns, v = v_matrix, number_of_1_loci = number_of_1_loci, number_of_2_loci = number_of_2_loci, locus_names = loci)
      q_opt <- estimate_q(x = x_int_ns, v = v_matrix, non_standard_haplotypes = non_standard_df, number_of_1_loci = number_of_1_loci, number_of_2_loci = number_of_2_loci)
      dimnames(q_opt) <- dimnames_q
      
      conv <- FALSE
      while(!conv){
        
        # optimise the variances again including the non-standard haplotypes
        f_ns <- function(theta) {
          negll <- neg_loglik_theta_ns(theta, x_int_ns, y, pi_opt, q_opt, number_of_1_loci, number_of_2_loci)
          
          # if (is.nan(negll) || is.infinite(negll)){
          #   theta_fail <<- theta
          #   browser()
          # }
          negll
        }
        
        opt_ns <- stats::optim(par = theta, fn = f_ns, method = "BFGS", control = list(reltol = 1e-16, trace = optim_trace))
        if (opt$convergence != 0) stop("BFGS failed to converge")
        theta <- opt_ns$par
        
        # make v-matrix again
        tau_opt <- get_tau(theta = theta_opt, number_of_loci = number_of_loci, number_of_clusters = number_of_clusters)
        p_opt <- get_P(theta = theta_opt, number_of_loci = number_of_loci, number_of_clusters = number_of_clusters)
        rownames(p_opt) <- cluster_labels
        colnames(p_opt) <- loci
        
        # recreate the v matrix
        x_profile_pr_by_cluster <- compute_profile_prs_ns(p_by_cluster_and_locus = p_opt, db = x_int, y = y, pi_opt, q_opt, number_of_1_loci, number_of_2_loci)
        v_matrix <- compute_posterior_cluster_prs(profile_pr = x_profile_pr_by_cluster, tau = tau_opt)
        
        # estimate pi
        pi_opt_next <- estimate_pr_ns(x = x_int_ns, v = v_matrix, number_of_1_loci = number_of_1_loci, number_of_2_loci = number_of_2_loci, locus_names = loci)
        
        # estimate q
        q_opt_next <- estimate_q(x = x_int_ns, v = v_matrix, non_standard_haplotypes = non_standard_df, number_of_1_loci = number_of_1_loci, number_of_2_loci = number_of_2_loci)
        dimnames(q_opt_next) <- dimnames_q
        
        abs_diff_pi <- sum(abs(pi_opt_next - pi_opt))
        abs_diff_q <- sum(abs(q_opt_next - q_opt))
        
        if (verbose >=2){
          verbose_print("Abs difference in pi:", abs_diff_pi)
          verbose_print("Abs difference in q:", abs_diff_q)
        }
        
        conv <- abs_diff_pi<1e-12 & abs_diff_q <1e-12
        pi_opt <- pi_opt_next
        q_opt <- q_opt_next
      }
      
      # check if the centers are optimal
      y_new <- move_centers(x_int, y, v_matrix) # note that we need NAs for non-standard haplotypes here
      
      dist_new_y <- sum(abs(y - y_new))
      
      if (dist_new_y > 0){
        verbose_print("Centers need to be moved after second stage")
        verbose_print("Number of stepwise mutations between center configurations = ", dist_new_y)
        
        y <- y_new
      } 
      else{
        verbose_print("Second stage finished: centers need not be moved")
        break
      }
    }
  }
  
  number_of_iterations <- length(y_iterations)
  
  if (verbose >= 1) {
    iterations_label <- function(i) if (i==1) "iteration" else "iterations"
    verbose_print("Finished after", number_of_iterations, iterations_label(number_of_iterations))
  }
  
  x_profile_pr = rowSums(rep(tau_opt, each= nrow(x_int)) * x_profile_pr_by_cluster)
  
  log_lik = sum(log(x_profile_pr))
  
  ret <-  list(
    y_iterations = y_iterations, theta_iterations = theta_iterations,
    x_int = x_int, profile_pr_by_cluster = x_profile_pr_by_cluster, 
    profile_pr = x_profile_pr, posterior_cluster_pr = v_matrix, 
    log_lik = log_lik,
    y = y, p = p_opt
  )
  
  if(exists("x_int_ns")) ret$x_int_ns <- x_int_ns
  if(exists("x_stripped_int")) ret$x_stripped_int <- x_stripped_int
  if(exists("pi_opt")) ret$pi <- pi_opt
  if(exists("q_opt")) ret$q <- q_opt
  
  if(!exists("q_opt")){
    ret$number_of_parameters <- length(theta) + length(y)
    ret$BIC <- -2 * log_lik + ret$number_of_parameters * log(length(x_int))
  }else{
    ret$number_of_parameters <- length(theta) + length(y) + length(q_opt)
    ret$BIC <- -2 * log_lik + ret$number_of_parameters * log(length(x_int_ns))
  }
  
  ret$tau <- tau_opt
  
  ret$one_loci <- one_loci
  ret$two_loci <- two_loci
  
  ret
}

check_input_db <- function(x){
  if (!is.data.frame(x)) stop("x should be a data frame")
  if (!all(sapply(x,is.character))) stop("columns of x should be character vectors")
  
  x_unlist <- unlist(x)
  illegal_haplotypes <- x_unlist[grep(" ", x_unlist)]
  if (length(illegal_haplotypes) > 1){
    stop("Found haplotypes containing space: ", illegal_haplotypes)
  }
}

to_int_db <- function(x, one_loci, two_loci){
  
  # convert the stripped down db to an integer matrix
  x_int_one_loci <- matrix(as.integer(unlist(x[, one_loci])), 
                           nrow = nrow(x))
  colnames(x_int_one_loci) <- one_loci
  
  x_int_two_loci <- matrix(integer(), nrow=nrow(x), ncol=2 * length(two_loci))
  if(length(two_loci) > 0) colnames(x_int_two_loci) <- paste0(rep(two_loci, each=2),c("(1)","(2)"))
  
  for( locus in two_loci){
    x_locus <- x[,locus]
    locus_split <- ifelse(is.na(x_locus), yes = list(c(NA_character_,NA_character_)), no =strsplit(x_locus, split = ","))
    
    x_int_two_loci[,paste0(locus,c("(1)","(2)"))] <- matrix(as.integer(unlist(locus_split)), nrow = nrow(x), ncol=2,byrow = TRUE)
  }
  
  x_int <- cbind(x_int_one_loci, x_int_two_loci)
  
  x_int
}
Any scripts or data that you put into this service are public.
disclapmix2 documentation built on April 12, 2023, 12:41 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
disclapmix2
Mixtures of Discrete Laplace Distributions using Numerical Optimisation

R/disclapmix2.R
In disclapmix2: Mixtures of Discrete Laplace Distributions using Numerical Optimisation

Defines functions to_int_db check_input_db disclapmix2

Documented in disclapmix2

Try the disclapmix2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

disclapmix2 Mixtures of Discrete Laplace Distributions using Numerical Optimisation

R/disclapmix2.R In disclapmix2: Mixtures of Discrete Laplace Distributions using Numerical Optimisation

Defines functions to_int_db check_input_db disclapmix2

Documented in disclapmix2

Try the disclapmix2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

disclapmix2
Mixtures of Discrete Laplace Distributions using Numerical Optimisation

R/disclapmix2.R
In disclapmix2: Mixtures of Discrete Laplace Distributions using Numerical Optimisation