R/RegressHaplo_util.R

Defines functions penalized_regression.RegressHaplo penalized_regression_parameters.RegressHaplo haplotype_permute.RegressHaplo number_global_haplotypes.RegressHaplo compute_solution.RegressHaplo get_nonzero_solution.RegressHaplo get_loci.RegressHaplo get_fit.RegressHaplo get_df.RegressHaplo get_pi.RegressHaplo get_h.RegressHaplo nofilter_and_optimize.RegressHaplo filter_and_optimize.RegressHaplo

Documented in compute_solution.RegressHaplo filter_and_optimize.RegressHaplo nofilter_and_optimize.RegressHaplo penalized_regression_parameters.RegressHaplo penalized_regression.RegressHaplo

#' Generate consistent haplotypes for a read table and, if desired, apply
#' RegressHaplo optimization.
#'
#' Generates consistent haplotypes by filtering local haplotypes using
#' the RegressHaplo algorithm to satisfy dimensions requirements, and
#' if desired then applies the RegressHaplo algorithm globally.
#'
#' @param df read table
#' @param global_rho If a global fit should be computed, the rho that should
#' be used.
#' @param max_gobal_dim The maximum number of consistent haplotypes that
#' should be generated
#' @param max_local_dim The maximum number of haplotypes that can be filtered
#' @param min_cover The minimum read coverage needed to link across a
#' read table position
#' @param run_optimization Should an optimization be run, or should just
#' consistent global haplotypes be returned.
#'
#'
#' @details Haplotypes are generated by splitting the read table
#' positions into loci and then iteratively filtering the local haplotypes
#' using the RegressHaplo algorithm until all combinations of local haplotypes
#' have dimension less than max_global_dim.  At the outset, loci are defined
#' as positions linked by reads, but if a locus has too many consistent
#' haplotypes (> max_local_dim), then a locus is split in half until the
#' dimension is reduced.  This allows application of the RegressHaplo
#' algorithm locally.
#'
#' To run an optimization, run_optimization must be TRUE and a global_rho
#' must be provided.
#'
#' @return A list constaining the elements df, pi, fit, and h.   df
#' is simply the read_table returned.  h are the global
#' consistent haplotypes generated after filtering; h is a
#' character matrix with colnames giving positions. pi and fit
#' are NA if the optimization is not run, otherwhise pi is a vector
#' of frequencies with length equal to the number of haplotypes (nrow(h))
#' and fit is a scalar describing the fit of the solution.
filter_and_optimize.RegressHaplo <- function(df, global_rho=NULL,
                                 max_global_dim=10000,
                                 max_local_dim=1200,
                                 min_cover=500,
                                 run_optimization=F)
{
  # split the read table into unlinked read tables
  sdf <- split_read_table.RegressHaplo(df, min_cover=min_cover,
                                       max_dim=max_local_dim)

  # for each sdf, do a local pass and keep going until dimension is small
  # enough
  local_rho <- .001
  repeat {
    rh_local <- lapply(sdf, nofilter_and_optimize.RegressHaplo,
                       max_dim=max_local_dim,
                       min_cover=min_cover,
                       rho=local_rho)

    # extract all the haplotypes
    h_local <- lapply(rh_local, function(rh) {
      get_nonzero_solution.RegressHaplo(rh)$h
    })

    # form all permutations
    h_consistent <- haplotype_permute.RegressHaplo(h_local)

    if (nrow(h_consistent) <= max_global_dim)
      break
    else
      local_rho <- sqrt(10)*local_rho
  }
  colnames(h_consistent) <- pos_names.read_table(df)

  loci <- lapply(h_local, colnames)

#   # if not running optimization, then just return the consistent haplotypes
  if (!run_optimization | is.null(global_rho))
    return (list(df=df,
                 pi=NA,
                 fit=NA,
                 loci=loci,
                 h=h_consistent))

  # compute_solution
  solution <- compute_solution.RegressHaplo(df, h_consistent, global_rho)

  return (list(df=df,
               pi=solution$pi,
               fit=solution$fit,
               loci=loci,
               h=h_consistent))
}


#' Generate consistent haplotypes for a read table and applies a
#' RegressHaplo optimization.
#'
#' Generates consistent haplotypes across loci, with no filtering as
#' compared to filter_and_compute.RegressHaplo, and then if dimension
#' is small enough applies RegressHaplo algorithm
#'
#' @param df read table
#' @param rho The rho that should be used.
#' @param max_dim The maximum number of consistent haplotypes that
#' the RegressHaplo algorithm can handle.
#' @param max_dim The maximum number of haplotypes that can be filtered
#' @param min_cover The minimum read coverage needed to link across a
#' read table position
#'
#'
#' @details Haplotypes are generated by splitting the read table
#' positions into loci and then creating global haplotypes by
#' considering all possible local haplotype combinations.  If
#' the number of consistent haplotypes < max_dim, RegressHaplo is applied.
#'
#' To run an optimization, run_optimization must be TRUE and a global_rho
#' must be provided.
#'
#' @return #' @return A list constaining the elements df, pi, fit, and h.
#' df is simply the read_table returned.  h are the global
#' consistent haplotypes generated after filtering; h is a
#' character matrix with colnames giving positions. pi and fit
#' are NA if the optimization is not run, otherwhise pi is a vector
#' of frequencies with length equal to the number of haplotypes (nrow(h))
#' and fit is a scalar describing the fit of the solution.
nofilter_and_optimize.RegressHaplo <- function(df, rho,
                              max_dim=1200,
                              min_cover=500)
{
  if (nrow(df)==0)
    return (list(df=df, pi=NA, fit=NA, loci=NA, h=NA))

  h_consistent <- consistent_haplotypes_across_loci.read_table(df,
                                                               min_cover = min_cover,
                                                               max_num_haplotypes=max_dim)

  # check if there were too many haplotypes
  if (is.null(h_consistent))
    return (list(df=df,
                 pi=NA,
                 fit=NA,
                 loci=NA,
                 h=NA))

  colnames(h_consistent) <- pos_names.read_table(df)
  loci <- list(colnames(h_consistent))

  if (nrow(h_consistent) > max_dim)
    return (list(df=df,
                 pi=NA,
                 fit=NA,
                 loci=loci,
                 h=h_consistent))

  solution <- compute_solution.RegressHaplo(df, h_consistent, rho)

  return (list(df=df,
               pi=solution$pi,
               fit=solution$fit,
               loci=loci,
               h=h_consistent))
}



#########################
get_h.RegressHaplo <- function(rh)
{
  return (rh$h)
}

get_pi.RegressHaplo <- function(rh)
{
  return (rh$pi)
}

get_df.RegressHaplo <- function(rh)
{
  return  (rh$df)
}

get_fit.RegressHaplo <- function(rh)
{
  return (rh$fit)
}

get_loci.RegressHaplo <- function(rh)
{
  return (rh$loci)
}

get_nonzero_solution.RegressHaplo <- function(rh)
{
  h <- get_h.RegressHaplo(rh)
  pi <- get_pi.RegressHaplo(rh)

  ind <- (pi > 0)
  h_nonzero <- h[ind,,drop=F]
  pi_nonzero <- pi[ind]
  rownames(h_nonzero) <- round(100*pi_nonzero)

  ind <- order(pi_nonzero, decreasing = T)
  h_nonzero <- h_nonzero[ind,,drop=F]
  pi_nonzero <- pi_nonzero[ind]

  return (list(h=h_nonzero, pi=pi_nonzero))
}



#' Fit regression for a given rho
#'
#' @param df read table
#' @param h_full haplotype matrix
#' @param rho rho to be used in regression
#'
#' @return A list with entries pi and fit.  pi gives the frequencies
#' estimated for h_full and fit gives a fit value.
compute_solution.RegressHaplo <- function(df, h_full, rho,
                                          verbose=T)
{
  kk <- 2
  par <- penalized_regression_parameters.RegressHaplo(df, h_full)
  rout <- penalized_regression.RegressHaplo(par$y, par$P, pi=NULL,
                                               rho=rho, kk=kk,
                                            verbose=verbose)

  return (rout)
}

number_global_haplotypes.RegressHaplo <- function(local_rho, ldf, max_num_haplotypes)
{
  h_local <- Map(function(cdf, i) {

    haps <- consistent_haplotypes.read_table(cdf, rm.na=T,
                                             max_num_haplotypes = max_num_haplotypes)
    if (is.null(haps)) {
      cat("locus ", i, "has read table with too many paths\n")
      cat("read_table_to_loci.pipeline was not run.  If it was, please open issue in github.")
      stop("bad locus")
    }
    # rho==0 means no filtering, so don't waste time by regressin
    if (local_rho==0)
      return (haps)

    par <- penalized_regression_parameters.RegressHaplo(cdf, haps)
    rh <- penalized_regression.RegressHaplo(par$y, par$P, rho=local_rho, kk=2, verbose=F)
    pi <- get_pi.RegressHaplo(rh)

    filtered_haps <- haps[pi>0,,drop=F]

    return (filtered_haps)
  }, ldf, 1:length(ldf))

  nhaps_local <- sapply(h_local, nrow)
  total_haps <- prod(nhaps_local)

  return (total_haps)
}


haplotype_permute.RegressHaplo <- function(h_list)
{
  pos_list <- lapply(h_list, colnames)
  pos <- do.call(c, pos_list)

  # for now remove NA
  h_list <- lapply(h_list, function(h) {
    ind <- apply(h, 1, function(hrow) all(!is.na(hrow)))
    matrix(as.character(h[ind,]), nrow=sum(ind))
  })
  h_list_s <- lapply(h_list, function(h) {
    apply(h, 1, paste, collapse="")
  })

  h_df_s <- expand.grid(h_list_s)
  haps <- apply(h_df_s, 1, function(hvec) {
    hvec_paste <- paste(hvec, collapse="")
    strsplit(hvec_paste, split="")[[1]]
  })
  if (all(class(haps) != "matrix"))
    haps <- matrix(haps, ncol=1)
  else
    haps <- t(haps)

  colnames(haps) <- pos
  return (haps)
}

##################################################################
# Core RegressHaplo API:  parameter calculation and regression computation

#' Returns the matrices and vectors associated with the regression
#' optimization
#'
#' @details The regression solves
#' min |y - P*pi|^2 + rho pi^T*M*pi
#' subject to:  sum pi_i = 1, pi_i ge 0.
#'
#' @param df read table
#' @param h haplotype matrix
#'
#' @return the matrix P and the vector y as a list.  M is implicit
#' and so not computed.
#' @export
penalized_regression_parameters.RegressHaplo <- function(df, h, position_fit=F)
{
  # get position nucleotide counts
  #nucs_mat <- nucs_at_pos.read_table(df)
  K <- nrow(h)

  rf <- readFit(df, h, pi=NULL, position=position_fit)

  P_list <- lapply(rf, function(crf) crf$P)
  y_list <- lapply(rf, function(crf) crf$sampled_freq)

  P <- do.call(rbind, P_list)
  y <- do.call(c, y_list)

  # penalty matrix, it's concave!
  #M <- matrix(1, nrow=K, ncol=K)*(1-diag(K))

  return (list(y=y, P=P))
}

#' Solves min_pi |y - P*pi|^2 + rho*pi^T*M*pi
#' @export
penalized_regression.RegressHaplo <- function(y, P, pi=NULL, rho, kk=2,
                                              verbose=T)
{
  if (is.null(pi)) {
    pi <- runif(ncol(P))
    pi <- pi/sum(pi)
  }

  pi_full <- optimize.engine(y=y, P=P, rho=rho, pi=pi, mu=0, kk=kk,
                             verbose=verbose)

  # keep only significant frequencies
  ind <- pi_full > 10^-3
  pi_short0 <- pi_full[ind]
  P_short <- P[,ind,drop=F]

  if (length(pi_short0) > 1)
    pi_short <- optimize.engine(y=y, P=P_short, rho=0, pi=pi_short0,
                                mu=0, kk=kk, verbose=verbose)
  else
    pi_short <- pi_short0

  fit_vec <- y - P_short %*% pi_short
  fit <- sqrt(sum(fit_vec*fit_vec))

  # return solution limited to selected haplotypes and unbiased by penalty
  pi_full[!ind] <- 0
  pi_full[ind] <- pi_short

  return (list(pi=pi_full, fit=fit))
}
SLeviyang/RegressHaplo documentation built on June 1, 2022, 10:48 p.m.