#' Generate consistent haplotypes for a read table and, if desired, apply
#' RegressHaplo optimization.
#'
#' Generates consistent haplotypes by filtering local haplotypes using
#' the RegressHaplo algorithm to satisfy dimensions requirements, and
#' if desired then applies the RegressHaplo algorithm globally.
#'
#' @param df read table
#' @param global_rho If a global fit should be computed, the rho that should
#' be used.
#' @param max_gobal_dim The maximum number of consistent haplotypes that
#' should be generated
#' @param max_local_dim The maximum number of haplotypes that can be filtered
#' @param min_cover The minimum read coverage needed to link across a
#' read table position
#' @param run_optimization Should an optimization be run, or should just
#' consistent global haplotypes be returned.
#'
#'
#' @details Haplotypes are generated by splitting the read table
#' positions into loci and then iteratively filtering the local haplotypes
#' using the RegressHaplo algorithm until all combinations of local haplotypes
#' have dimension less than max_global_dim. At the outset, loci are defined
#' as positions linked by reads, but if a locus has too many consistent
#' haplotypes (> max_local_dim), then a locus is split in half until the
#' dimension is reduced. This allows application of the RegressHaplo
#' algorithm locally.
#'
#' To run an optimization, run_optimization must be TRUE and a global_rho
#' must be provided.
#'
#' @return A list constaining the elements df, pi, fit, and h. df
#' is simply the read_table returned. h are the global
#' consistent haplotypes generated after filtering; h is a
#' character matrix with colnames giving positions. pi and fit
#' are NA if the optimization is not run, otherwhise pi is a vector
#' of frequencies with length equal to the number of haplotypes (nrow(h))
#' and fit is a scalar describing the fit of the solution.
filter_and_optimize.RegressHaplo <- function(df, global_rho=NULL,
max_global_dim=10000,
max_local_dim=1200,
min_cover=500,
run_optimization=F)
{
# split the read table into unlinked read tables
sdf <- split_read_table.RegressHaplo(df, min_cover=min_cover,
max_dim=max_local_dim)
# for each sdf, do a local pass and keep going until dimension is small
# enough
local_rho <- .001
repeat {
rh_local <- lapply(sdf, nofilter_and_optimize.RegressHaplo,
max_dim=max_local_dim,
min_cover=min_cover,
rho=local_rho)
# extract all the haplotypes
h_local <- lapply(rh_local, function(rh) {
get_nonzero_solution.RegressHaplo(rh)$h
})
# form all permutations
h_consistent <- haplotype_permute.RegressHaplo(h_local)
if (nrow(h_consistent) <= max_global_dim)
break
else
local_rho <- sqrt(10)*local_rho
}
colnames(h_consistent) <- pos_names.read_table(df)
loci <- lapply(h_local, colnames)
# # if not running optimization, then just return the consistent haplotypes
if (!run_optimization | is.null(global_rho))
return (list(df=df,
pi=NA,
fit=NA,
loci=loci,
h=h_consistent))
# compute_solution
solution <- compute_solution.RegressHaplo(df, h_consistent, global_rho)
return (list(df=df,
pi=solution$pi,
fit=solution$fit,
loci=loci,
h=h_consistent))
}
#' Generate consistent haplotypes for a read table and applies a
#' RegressHaplo optimization.
#'
#' Generates consistent haplotypes across loci, with no filtering as
#' compared to filter_and_compute.RegressHaplo, and then if dimension
#' is small enough applies RegressHaplo algorithm
#'
#' @param df read table
#' @param rho The rho that should be used.
#' @param max_dim The maximum number of consistent haplotypes that
#' the RegressHaplo algorithm can handle.
#' @param max_dim The maximum number of haplotypes that can be filtered
#' @param min_cover The minimum read coverage needed to link across a
#' read table position
#'
#'
#' @details Haplotypes are generated by splitting the read table
#' positions into loci and then creating global haplotypes by
#' considering all possible local haplotype combinations. If
#' the number of consistent haplotypes < max_dim, RegressHaplo is applied.
#'
#' To run an optimization, run_optimization must be TRUE and a global_rho
#' must be provided.
#'
#' @return #' @return A list constaining the elements df, pi, fit, and h.
#' df is simply the read_table returned. h are the global
#' consistent haplotypes generated after filtering; h is a
#' character matrix with colnames giving positions. pi and fit
#' are NA if the optimization is not run, otherwhise pi is a vector
#' of frequencies with length equal to the number of haplotypes (nrow(h))
#' and fit is a scalar describing the fit of the solution.
nofilter_and_optimize.RegressHaplo <- function(df, rho,
max_dim=1200,
min_cover=500)
{
if (nrow(df)==0)
return (list(df=df, pi=NA, fit=NA, loci=NA, h=NA))
h_consistent <- consistent_haplotypes_across_loci.read_table(df,
min_cover = min_cover,
max_num_haplotypes=max_dim)
# check if there were too many haplotypes
if (is.null(h_consistent))
return (list(df=df,
pi=NA,
fit=NA,
loci=NA,
h=NA))
colnames(h_consistent) <- pos_names.read_table(df)
loci <- list(colnames(h_consistent))
if (nrow(h_consistent) > max_dim)
return (list(df=df,
pi=NA,
fit=NA,
loci=loci,
h=h_consistent))
solution <- compute_solution.RegressHaplo(df, h_consistent, rho)
return (list(df=df,
pi=solution$pi,
fit=solution$fit,
loci=loci,
h=h_consistent))
}
#########################
get_h.RegressHaplo <- function(rh)
{
return (rh$h)
}
get_pi.RegressHaplo <- function(rh)
{
return (rh$pi)
}
get_df.RegressHaplo <- function(rh)
{
return (rh$df)
}
get_fit.RegressHaplo <- function(rh)
{
return (rh$fit)
}
get_loci.RegressHaplo <- function(rh)
{
return (rh$loci)
}
get_nonzero_solution.RegressHaplo <- function(rh)
{
h <- get_h.RegressHaplo(rh)
pi <- get_pi.RegressHaplo(rh)
ind <- (pi > 0)
h_nonzero <- h[ind,,drop=F]
pi_nonzero <- pi[ind]
rownames(h_nonzero) <- round(100*pi_nonzero)
ind <- order(pi_nonzero, decreasing = T)
h_nonzero <- h_nonzero[ind,,drop=F]
pi_nonzero <- pi_nonzero[ind]
return (list(h=h_nonzero, pi=pi_nonzero))
}
#' Fit regression for a given rho
#'
#' @param df read table
#' @param h_full haplotype matrix
#' @param rho rho to be used in regression
#'
#' @return A list with entries pi and fit. pi gives the frequencies
#' estimated for h_full and fit gives a fit value.
compute_solution.RegressHaplo <- function(df, h_full, rho,
verbose=T)
{
kk <- 2
par <- penalized_regression_parameters.RegressHaplo(df, h_full)
rout <- penalized_regression.RegressHaplo(par$y, par$P, pi=NULL,
rho=rho, kk=kk,
verbose=verbose)
return (rout)
}
number_global_haplotypes.RegressHaplo <- function(local_rho, ldf, max_num_haplotypes)
{
h_local <- Map(function(cdf, i) {
haps <- consistent_haplotypes.read_table(cdf, rm.na=T,
max_num_haplotypes = max_num_haplotypes)
if (is.null(haps)) {
cat("locus ", i, "has read table with too many paths\n")
cat("read_table_to_loci.pipeline was not run. If it was, please open issue in github.")
stop("bad locus")
}
# rho==0 means no filtering, so don't waste time by regressin
if (local_rho==0)
return (haps)
par <- penalized_regression_parameters.RegressHaplo(cdf, haps)
rh <- penalized_regression.RegressHaplo(par$y, par$P, rho=local_rho, kk=2, verbose=F)
pi <- get_pi.RegressHaplo(rh)
filtered_haps <- haps[pi>0,,drop=F]
return (filtered_haps)
}, ldf, 1:length(ldf))
nhaps_local <- sapply(h_local, nrow)
total_haps <- prod(nhaps_local)
return (total_haps)
}
haplotype_permute.RegressHaplo <- function(h_list)
{
pos_list <- lapply(h_list, colnames)
pos <- do.call(c, pos_list)
# for now remove NA
h_list <- lapply(h_list, function(h) {
ind <- apply(h, 1, function(hrow) all(!is.na(hrow)))
matrix(as.character(h[ind,]), nrow=sum(ind))
})
h_list_s <- lapply(h_list, function(h) {
apply(h, 1, paste, collapse="")
})
h_df_s <- expand.grid(h_list_s)
haps <- apply(h_df_s, 1, function(hvec) {
hvec_paste <- paste(hvec, collapse="")
strsplit(hvec_paste, split="")[[1]]
})
if (all(class(haps) != "matrix"))
haps <- matrix(haps, ncol=1)
else
haps <- t(haps)
colnames(haps) <- pos
return (haps)
}
##################################################################
# Core RegressHaplo API: parameter calculation and regression computation
#' Returns the matrices and vectors associated with the regression
#' optimization
#'
#' @details The regression solves
#' min |y - P*pi|^2 + rho pi^T*M*pi
#' subject to: sum pi_i = 1, pi_i ge 0.
#'
#' @param df read table
#' @param h haplotype matrix
#'
#' @return the matrix P and the vector y as a list. M is implicit
#' and so not computed.
#' @export
penalized_regression_parameters.RegressHaplo <- function(df, h, position_fit=F)
{
# get position nucleotide counts
#nucs_mat <- nucs_at_pos.read_table(df)
K <- nrow(h)
rf <- readFit(df, h, pi=NULL, position=position_fit)
P_list <- lapply(rf, function(crf) crf$P)
y_list <- lapply(rf, function(crf) crf$sampled_freq)
P <- do.call(rbind, P_list)
y <- do.call(c, y_list)
# penalty matrix, it's concave!
#M <- matrix(1, nrow=K, ncol=K)*(1-diag(K))
return (list(y=y, P=P))
}
#' Solves min_pi |y - P*pi|^2 + rho*pi^T*M*pi
#' @export
penalized_regression.RegressHaplo <- function(y, P, pi=NULL, rho, kk=2,
verbose=T)
{
if (is.null(pi)) {
pi <- runif(ncol(P))
pi <- pi/sum(pi)
}
pi_full <- optimize.engine(y=y, P=P, rho=rho, pi=pi, mu=0, kk=kk,
verbose=verbose)
# keep only significant frequencies
ind <- pi_full > 10^-3
pi_short0 <- pi_full[ind]
P_short <- P[,ind,drop=F]
if (length(pi_short0) > 1)
pi_short <- optimize.engine(y=y, P=P_short, rho=0, pi=pi_short0,
mu=0, kk=kk, verbose=verbose)
else
pi_short <- pi_short0
fit_vec <- y - P_short %*% pi_short
fit <- sqrt(sum(fit_vec*fit_vec))
# return solution limited to selected haplotypes and unbiased by penalty
pi_full[!ind] <- 0
pi_full[ind] <- pi_short
return (list(pi=pi_full, fit=fit))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.