## R/misc_functions.R

#' Z-score
#' 
#' \code{zt} returns the z-score (number of standard deviations from the mean)
#' of the input given a mean and standard deviation
#' @param obs (numeric) value or vector of values
#' @param mean (numeric) the mean that should be used for calculation of z
#' @param sd (numeric) standard deviation for calculation of z
#' @param absval (boolean) should the absolute value of the z-score be returned?
#' @return (numeric) a vector of equal length to the input
#' @examples 
#' zt(1, 1, 1)
#' zt(0, 1, 1)
#' zt(10, 1, 1)
#' zt(1:5, 1, 0.3)
zt <- function(obs, mean, sd, absval = T){
  if(absval){
    abs((obs-mean)/sd)
  }else{
    (obs-mean)/sd
  }
}
#' Standard normal density at the z-score
#' 
#' \code{zt_p} returns the standard normal density \eqn{\phi(z)} of the input,
#' obtained by mapping x to a normal distribution with mean equal to zero and
#' standard deviation equal to one using its z-score
#' @param obs (numeric) value or vector of values
#' @param mean (numeric) the mean of the gaussian being compared against
#' @param sd (numeric) standard deviation of the gaussian being compared
#' against
#' @return (numeric) a density value between zero and ~0.399
#' @examples 
#' zt_p(1, 1, 1)
#' zt_p(0, 1, 1)
#' zt_p(1:5, 1, 1)
zt_p <- function(obs, mean, sd){
  dnorm(zt(obs, mean, sd), mean = 0, sd = 1)
}

#' Likelihood function of a gaussian distribution
#' 
#' \code{norm_lik} returns the value of \eqn{L(x | \mu, \sigma)}, the joint
#' likelihood of a vector of observations under a gaussian distribution
#' @param all_obs (numeric) vector of values over which to compute the 
#' likelihood
#' @param mean (numeric) mean of the gaussian
#' @param sd (numeric) standard deviation of the gaussian
#' @return (numeric) a single value, the joint likelihood of all observations
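#' @examples 
#' # Illustrative call with arbitrary values; equivalent to the product of the
#' # per-observation normal densities
#' norm_lik(c(0.9, 1.1), mean = 1, sd = 0.5)
#' prod(dnorm(c(0.9, 1.1), mean = 1, sd = 0.5))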
norm_lik <- function(all_obs, mean, sd){
  ((2*pi*sd^2)^(-length(all_obs)/2))*exp(-1/(2*sd^2)*sum((all_obs - mean)^2))
}

#' Probability density function of a gaussian distribution
#' 
#' \code{normpdf} returns the value of \eqn{f(x, \mu, \sigma^2)}, the
#' probability density function (PDF) of a gaussian distribution
#' @param x (numeric) vector of values over which to compute the PDF
#' @param mean (numeric) mean of the gaussian
#' @param sd (numeric) standard deviation of the gaussian
#' @return (numeric) vector of values equal in length to x
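#' @examples 
#' # Illustrative values; matches dnorm() for the same parameters
#' normpdf(0, mean = 0, sd = 1)
#' normpdf(seq(-2, 2, by = 1), mean = 0, sd = 1)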
normpdf <- function(x, mean, sd){
  prob <- 1/sqrt(2 * pi * sd^2) * exp(-((x - mean)^2)/(2*sd^2))
  return(prob)
}

#' Fits a gaussian mixture model with specified parameters to the data,
#' agnostic to weights
#' 
#' \code{parametric_gmm_fit} returns a matrix of standardized normal 
#' probabilities with rows as observations and columns as mixture components
#' @param data (numeric) vector of values to compute probabilities for
#' @param means (numeric) vector of gaussian means, one per component
#' @param variances (numeric) vector of gaussian variances, either length one
#' or equal to the length of means
#' @return matrix of values with row count equal to the length of data and
#' column count equal to the length of the means
#' @examples 
#' parametric_gmm_fit(0:5, c(1, 5), 2)
parametric_gmm_fit <- function(data, means, variances){
  gaussians <- cbind(means, variances)
  result <- outer(data, gaussians[,1], "zt_p", "sd" = gaussians[,2])
  colnames(result) = row.names(gaussians)
  return(result)
}
#' Computes the weighted median absolute deviation (MAD) of data fit to a
#' gaussian mixture model
#' 
#' \code{gmm_mad} returns the responsibility-weighted median absolute deviation
#' of data fit to the specified mixture model
#' @param data (numeric) vector of values to compute the MAD over
#' @param means (numeric) vector of means for the mixture model
#' @param variances (numeric) vector of gaussian variances, either length one
#' or equal to the length of means
#' @return a single numeric of the MAD for the model
#' @examples 
#' gmm_mad(0:5, c(1, 5), 2)
gmm_mad <- function(data, means, variances){
  responsibilities <- compute_responsibilities(data, means, variances)
  weights <- colSums(responsibilities)/sum(colSums(responsibilities))
  devs <- abs(outer(data, means, "-"))
  mad <- lapply(1:ncol(devs), function(x)devs[,x]*responsibilities[,x]) %>% do.call(cbind, .) %>% rowSums %>% median
  return(mad)
}
#' Optimizes the mean differential for a special case of gaussian mixture
#' models (GMMs) where component means differ by a constant factor
#' 
#' \code{gmm_em_fixed_var} returns a difference optimized using EM and a grid
#' search
#' @param data (numeric) vector of values for the GMM to be fit to
#' @param means (numeric) vector of means for GMM components
#' @param var (numeric) vector of gaussian variances, either length one
#' or equal to the length of means
#' @return a single numeric value representing the difference in means
gmm_em_fixed_var <- function(data, means, var){
  i_diff <- mean(diff(means))
  p_mat <- parametric_gmm_fit(data, means, var)
  r_mat <- p_mat/rowSums(p_mat)
  
  lik <- mean(rowSums(p_mat * r_mat, na.rm = T))
  
  n_means = colSums(r_mat * data, na.rm = T)/colSums(r_mat, na.rm = T)
  ## new diff is a weighted mean of the diffs, weighted by the diff'ed total responsibilities
  n_diff <- weighted.mean(diff(n_means), na.rm = T, w = ((colSums(r_mat, na.rm = T) + shift(colSums(r_mat, na.rm = T)))/2)[-1])
  #n_diff <- mean(diff(n_means), na.rm = T)
  if(is.na(n_diff)){
    return(var)
  }
  n_means = seq(from = means[6] - n_diff*5, to = means[6] + n_diff*5, by = n_diff)
  p_mat <- parametric_gmm_fit(data, n_means, var)
  r_mat <- p_mat/rowSums(p_mat)
  
  n_lik <- mean(rowSums(r_mat * p_mat, na.rm = T))
  
  while(n_lik > lik){
    lik <- n_lik
    n_means = colSums(r_mat * data, na.rm = T)/colSums(r_mat, na.rm = T)
    n_diff <- mean(diff(n_means), na.rm = T)
    n_means = seq(from = means[6] - n_diff*5, to = means[6] + n_diff*5, by = n_diff)
    p_mat <- parametric_gmm_fit(data, n_means, var)
    r_mat <- p_mat/rowSums(p_mat, na.rm = T)
    n_lik <- mean(rowSums(r_mat * p_mat, na.rm = T))
  }
  condition = T
  t <- c()
  for(i in seq(from = i_diff, to = n_diff, length.out = 20)){
    p_mat <- parametric_gmm_fit(data, seq(from = means[6] - i*5, to = means[6] + i*5, by = i), var)
    r_mat <- p_mat/rowSums(p_mat)
    lik = mean(rowSums(p_mat * r_mat, na.rm = T))
    t <- c(t, lik)
  }
  out_diff <- seq(from = i_diff, to = n_diff, length.out = 20)[which.max(t)]
  return(out_diff)
}
#' Computes responsibilities for a gaussian mixture model with specified
#' parameters
#' 
#' \code{compute_responsibilities} returns a matrix as in parametric_gmm_fit
#' except values are responsibilities rather than probabilities
#' @inheritParams parametric_gmm_fit
#' @return matrix of values with row count equal to the length of data and
#' column count equal to the length of the means
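#' @examples 
#' # Same toy inputs as the parametric_gmm_fit example; each row sums to one
#' compute_responsibilities(0:5, c(1, 5), 2)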
compute_responsibilities <- function(data, means, variances){
  mat <- parametric_gmm_fit(data = data, means = means, variances = variances)
  resp <- mat/rowSums(mat)
  resp[is.na(resp)[,1],] <- 0
  return(resp)
}

#' Determines variance for a gaussian mixture model (GMM) given fixed means 
#' using a variation of expectation-maximization for the special case of 
#' equal variances across all components
#' 
#' \code{gmm_em_only_var} returns a value of standard deviation for the given
#' GMM
#' @param data (numeric) vector of values for the GMM to be fit to
#' @param means (numeric) vector of means for GMM components
#' @param initial_var (numeric) single value, initial estimate for variance
#' @return a one-row data.frame containing the fitted standard deviation
#' (\code{vars}) and its associated likelihood (\code{liks})
gmm_em_only_var <- function(data, means, initial_var){
  ## Record initial value for variances
  initial_vars <- initial_var
  initial_weights <- rep(1, times = length(means))
  #
  means <- means
  weights <- initial_weights
  vars <- initial_vars
  #
  old_lik <- 0
  new_lik <- 0.002
  liks <- data.frame("vars" = c(), "lik" = c())
  #
  iter <- 0
  while((((new_lik - old_lik)/old_lik)) > 0.001 | iter < 2){
    old_lik <- new_lik
    ## LIKELIHOOD
    gaussian_probs <- parametric_gmm_fit(data = data, means = means, variances = vars)
    
    ## EXPECTATION STEP
    ## Compute responsibilities
    
    responsibilities <- t(t(gaussian_probs) * weights)
    responsibilities <- responsibilities/rowSums(responsibilities)
    responsibilities[is.na(responsibilities)[,1],] <- 0
    
    new_lik <- rowMeans(gaussian_probs * responsibilities) %>% mean()
    ### Record parameter information
    liks <- rbind.data.frame(liks, data.frame("vars" = vars, "liks" = new_lik))
    
    if(all(gaussian_probs == 0)){
      vars = vars * 2
      next
    }
    
    ## MAXIMIZATION STEP
    ## Compute mixing proportions
    resp_sums <- colSums(responsibilities)
    resp_means <- colMeans(responsibilities)
    weights <- resp_sums/sum(resp_sums)
    ## Compute means, disabled atm
    #resp_means <- colSums(responsibilities * mafs)
    #means <- resp_means/colSums(responsibilities)
    ## Compute vars
    resp_vars <- outer(X = data, Y = means, FUN = "-")^2 * responsibilities
    resp_vars <- colSums(resp_vars)
    resp_vars <- sqrt(resp_vars/(resp_sums - resp_means))
    vars <- resp_vars * weights
    vars <- weighted.mean(vars, w = weights)
    iter <- iter + 1
  }
  pdf_fun <- mixpdf_function(means = means, proportions = weights, sd = vars)
  return(liks[nrow(liks) - 1,])
  ## Unreachable diagnostic code retained below (it references `mafs`, which is
  ## not defined in this function)
  # plot(pdf_fun(seq(from = 0, to = 1, by = 0.01)), type = "l")
  # rug(x = mafs)
}
#' Generate the probability density function (pdf) of the specified gaussian
#' mixture model (GMM)
#' 
#' \code{mixpdf} returns a vector of probability density over the values of x
#' @param x (numeric) vector of values over which to compute the pdf
#' @param means (numeric) vector of means for GMM components
#' @param proportions (numeric) vector of GMM component weights
#' @param sd (numeric) single value of GMM component standard deviation
#' @return a numeric vector of mixture densities, equal in length to x
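#' @examples 
#' # Illustrative two-component mixture evaluated at a few arbitrary points
#' mixpdf(seq(0, 1, by = 0.25), means = c(0.25, 0.75),
#'        proportions = c(0.5, 0.5), sd = 0.1)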
mixpdf <- function(x, means, proportions, sd){
  parameters <- cbind.data.frame(means, proportions)
  parameters <- split(parameters, 1:nrow(parameters))
  out <- lapply(parameters, function(params){
    normpdf(x, params$means, sd)*params$proportions
  })
  out <- rowSums(do.call(cbind, out))
  return(out)
}
#' Generate a function that outputs the probability density function of a GMM
#' given an input range
#' 
#' \code{mixpdf_function} returns a function that computes the probability
#' density of the GMM over an input vector of values
#' @param means (numeric) vector of means for GMM components
#' @param proportions (numeric) vector of GMM component weights
#' @param sd (numeric) vector or single value of GMM component standard
#' deviation
#' @return a function of one argument \code{x} that returns a data.frame with
#' columns \code{x} and \code{y}, the mixture density at each value of x
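#' @examples 
#' # Illustrative sketch: build a density function for an arbitrary
#' # two-component mixture and evaluate it over a grid
#' f <- mixpdf_function(means = c(0.25, 0.75), proportions = c(0.5, 0.5),
#'                      sd = 0.05)
#' head(f(seq(0, 1, by = 0.01)))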
mixpdf_function <- function(means, proportions, sd){
  outfun <- function(x){
    parameters <- cbind.data.frame(means, proportions, sd)
    parameters <- split(parameters, 1:nrow(parameters))
    out <- lapply(parameters, function(params){
      normpdf(x, params$means, params$sd)*params$proportions
    })
    out <- do.call(cbind, out) %>% apply(1, sum)
    out=data.frame("x" = x, "y" = out)
    return(out)
  }
  return(outfun)
}
#' Compute the root-mean-squared (rms) deviation of a vector
#' 
#' \code{rms} returns the rms of the input vector
#' @param x (numeric) input vector
#' @param na.rm (boolean) should NAs be removed?
#' @return the rmsd of the input
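#' @examples 
#' # Illustrative values: sqrt(mean(c(9, 16)))
#' rms(c(3, -4))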
rms <- function(x, na.rm = T){
  sqrt(mean(x^2, na.rm = na.rm))
}
#' Convert vector into probability distribution
#' 
#' \code{props} returns the proportion-transformed version of input vector
#' @param x (numeric) input vector
#' @return a numeric vector of equal length to input
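#' @examples 
#' # Illustrative values: returns c(0.25, 0.75)
#' props(c(1, 3))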
props <- function(x){
  x/sum(x)
}

#' Compute predicted depths based on coverage characteristics
#' 
#' \code{depth} returns the depth of each specified copy number given the 
#' input parameters
#' @param maxpeak the depth of the most common copy number of the genome 
#' (eg. where CN = Ploidy)
#' @param d the depth difference per copy number. Can be calculated from 
#' \code{get_coverage_characteristics}
#' @param P the ploidy (most common copy number)
#' @param n a vector of all copy numbers to compute depths for
#' @return a numeric vector of predicted depths for each n, with names equal to 
#' n
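#' @examples 
#' # Illustrative parameters (arbitrary, not from real data): depths for copy
#' # numbers 0-4 given a diploid maxpeak of 100 and 10x depth per copy
#' depth(maxpeak = 100, d = 10, P = 2, n = 0:4)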
depth <- function(maxpeak, d, P, n){
  return(structure(maxpeak - d * (P - n), names = n))
}

#' Compute tumour purity at a different ploidy
#' 
#' \code{tp_diffploidy} returns the tumour purity at the given ploidy given 
#' input coverage characteristics, as returned by get_coverage_characteristics
#' @param cov_char coverage characteristics, as returned by 
#' get_coverage_characteristics
#' @param new_ploidy new ploidy to compute tumour purity for
#' @return single numeric value for tumour purity
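#' @examples 
#' # Illustrative call with a hand-built cov_char list in place of the output
#' # of get_coverage_characteristics
#' tp_diffploidy(list(maxpeak = 100, diff = 25), new_ploidy = 2)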
tp_diffploidy <- function(cov_char, new_ploidy){
  d0 = depth(maxpeak = cov_char$maxpeak, d = cov_char$diff, P = new_ploidy, n = 0)
  d2 = depth(maxpeak = cov_char$maxpeak, d = cov_char$diff, P = new_ploidy, n = 2)
  tp = d0/(d2)
  return(1 - tp)
}

#' @export
translate_model_characteristics <- function(new_diff, ploidy, maxpeak){
  depth <- function(maxpeak, d, P, n){
    return(structure(maxpeak - d * (P - n), names = n))
  }
  d0 = depth(maxpeak = maxpeak, d = new_diff, P = ploidy, n = 0)
  d2 = depth(maxpeak = maxpeak, d = new_diff, P = ploidy, n = 2)
  tp = d0/(d2)
  return(1 - tp)
}
#' Compute cosine similarity of two vectors
#' 
#' \code{cosine_sim} returns the cosine similarity of two vectors
#' @param vec1 the first vector
#' @param vec2 the second vector
#' @return single numeric value for cosine similarity
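#' @examples 
#' # Illustrative values: orthogonal vectors give 0, identical vectors give 1
#' cosine_sim(c(1, 0), c(0, 1))
#' cosine_sim(1:3, 1:3)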
cosine_sim <- function(vec1, vec2){
  if(length(vec1) != length(vec2)){
    stop("Two vectors have unequal length")
  }
  a <- sum(vec1 * vec2)
  b <- sqrt(sum(vec1^2))
  c <- sqrt(sum(vec2^2))
  csim <- a/(b*c)
  return(csim)
}

#' Return a vectorized counts table
#' 
#' \code{table_vec} returns a vector of counts of unique observations of input
#' @param x the input vector to count unique occurrences of
#' @return a numeric vector of counts, where names are the counted observation
#' and values are counts
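#' @examples 
#' # Illustrative input: returns a named vector with counts a = 1, b = 2
#' table_vec(c("a", "b", "b"))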
table_vec <- function(x){
  tab <- table(x)
  vec <- as.vector(tab)
  names(vec) <- names(tab)
  return(vec)
}

#' Compute the n50 length of a vector of lengths
#' 
#' \code{n50_segments} returns the n50 length of input lengths
#' @param lengths input vector of lengths
#' @return the n50 value
n50_segments <- function(lengths){
  lengths=sort(lengths)
  sumlengths=sum(as.numeric(lengths))
  currentsum=0
  index=1
  while(currentsum < 0.5*sumlengths){
    currentsum = currentsum + lengths[index]
    index=index+1
  }
  return(currentsum/(index-1))
}

#' Estimate variance by comparison to a density estimate
#' 
#' \code{match_kde_height} returns standard deviation such that the probability
#' density function of the gaussian mixture model (GMM) specified by the 
#' "means" parameter closely matches the kernel density estimation for the 
#' input data
#' @param data input data to estimate density and fit the mixture model to
#' @param means the means of the GMM
#' @param sd the initial estimate of standard deviation
#' @param comparison_point value to compute density function and density 
#' estimator height for optimization
#' @return an estimation of standard deviation
match_kde_height <- function(data, means, sd, comparison_point = NA){
  ## Refine SD by comparing with KDE of data
  if(length(means) > 1){
    kde_pad <- mean(diff(means))
  }else if(length(means) == 1){
    kde_pad <- quantile(data, 0.999) - means
  }
  kde_data <- density(data[data < (max(means))], n = 2^16, bw = "nrd0")
  
  ##Compute weights
  resp <- compute_responsibilities(data, means = means, variances = sd)
  proportions <- colSums(resp, na.rm = T)/sum(colSums(resp, na.rm = T), na.rm = T)
  
  if(!is.na(comparison_point)){
    maxpeak = comparison_point
  }else{maxpeak = kde_data$x[which.max(kde_data$y)]}
  
  pdf_fun <- mixpdf_function(means = means, proportions = proportions, sd = sd)
  
  plot_density_gmm(data, means, proportions, sd, bw = "nrd0")
  
  ## PDF function probs at maxpeak position
  
  pdf_means <- pdf_fun(maxpeak)$y
  
  ## Closest KDE points to maxpeak
  comp_height <- kde_data$y[which.min(abs(kde_data$x - maxpeak))]
  condition <- T
  sd_min <- sd/2
  sd_max = 5 * sd
  #sd_min = sd/5
  diffs <- c()
  values_vec <- seq(from = sd_min, to = sd_max, length.out = 10)
  values_diff <- diff(values_vec)[1]
  ## rearrange values_vec
  tmp_vec <- values_vec
  values_vec <- c()
  while(length(tmp_vec) > 0){
    values_vec <- c(values_vec, tmp_vec[1])
    values_vec <- c(values_vec, tmp_vec[length(tmp_vec)])
    tmp_vec <- tmp_vec[-c(1, length(tmp_vec))]
  }
  i = 1
  while(i <= length(values_vec)){
    resp <- compute_responsibilities(data, means = means, variances = values_vec[i])
    proportions <- colSums(resp)/sum(colSums(resp))
    pdf_fun <- mixpdf_function(means = means, proportions = proportions, sd = values_vec[i])
    ## PDF function probs at means
    pdf_means <- pdf_fun(maxpeak)$y
    ## Closest KDE points to maxpeak
    comp_height <- kde_data$y[which.min(abs(kde_data$x - maxpeak))]
    diffs <- c(diffs, (pdf_means - comp_height)/(pdf_means + comp_height))
    i = i+1
  }
  diffs <- diffs[order(values_vec)]
  values_vec <- values_vec[order(values_vec)]
  
  if((abs(sum(sign(diff(diffs)))) != length(diffs) - 1) & (all(diffs < 0) | all(diffs > 0))){
    return(values_vec[which.min(abs(diffs))])
  }
  if(all(diffs < 0) | all(diffs > 0)){
    return(match_kde_height(data = data, means = means, sd = values_vec[which.min(abs(diffs))]))
  }
  inflection_points <- which(diff(sign(diffs)) != 0)
  min_diff <- inflection_points[which.min(abs(diffs)[inflection_points])]
  neighbours <- c(min_diff-1,min_diff+1)
  neighbours <- neighbours[neighbours > 0]
  comp <- neighbours[which(sign(diffs[neighbours]) != sign(diffs[min_diff]))]
  if(length(comp) > 1){
    comp <- comp[which.min(abs(diffs[comp]))]
  }
  search_diffs <- sort(c(min_diff, comp))
  
  coef <- 0.5
  coef_add <- 0.25
  condition = T
  while(condition){
    new_var <- (values_vec[search_diffs[1]] * coef) + (values_vec[search_diffs[2]] * (1 - coef))
    ##
    resp <- compute_responsibilities(data, means = means, variances = new_var)
    proportions <- colSums(resp)/sum(colSums(resp))
    ##
    pdf_fun <- mixpdf_function(means = means, proportions = proportions, sd = new_var)
    
    plot_density_gmm(data, means, proportions, new_var)
    
    ## PDF function probs at means
    pdf_means <- pdf_fun(maxpeak)$y
    ## Closest KDE points to maxpeak
    comp_height <- kde_data$y[which.min(abs(kde_data$x - maxpeak))]
    
    diff <- (pdf_means - comp_height)/(pdf_means + comp_height)
    ## Decide which direction to go
    dir <- which(sign(diffs[search_diffs]) != sign(diff))
    
    if(dir == 1){
      coef <- coef + coef_add
    }else{
      coef <- coef - coef_add
    }
    coef_add <- coef_add/2
    if(abs(diff) < 0.01){
      condition <- F
    }
  }
  out_sd = new_var
  return(out_sd)
}

#' Fits a gaussian mixture-of-alleles model
#' 
#' \code{maf_gmm_fit} fits a gaussian mixture model for each possible
#' copy number state to VAF data, computes the probability of each copy number
#' and returns the model scores along with a matrix of probabilities for each
#' copy number state
#' @param depth_data input read depth data for all bins
#' @param vaf_data input vaf data for all bins
#' @param chr_vec vector of chromosomes for all bins
#' @param means vector of mixture model means
#' @param variances standard deviation for the mixture model
#' @param maf_variances standard deviation for VAF values
#' @param maxpeak depth of the most common copy number state in the genome
#' @param lowest optional unless ploidy specified; what is the copy number of 
#' the first element of means?
#' @param ploidy optional unless lowest specified; what is the most common 
#' copy number state?
#' @return a list containing \code{model}, a one-row data.frame of model scores
#' (VAF score, tumour purity, ploidy and lowest copy number), and
#' \code{jp_tbl}, a matrix of likelihoods where columns are copy number states
#' and rows are input observations
maf_gmm_fit <- function(depth_data, vaf_data, chr_vec, means, variances, maf_variances, maxpeak, lowest = NA, ploidy = NA){
  if(is.na(lowest) & is.na(ploidy)){
    stop("Must specify either lowest or ploidy")
  }
  # First we want to limit our VAF analysis to well-represented copy number 
  # states in the genome, so we filter out any GMM component that explains 
  # under 1% of the overall data
  ## Compute weights of initial fit from the responsibilities
  proportions = compute_responsibilities(data = depth_data, means = means, variances = variances)
  proportions = colSums(proportions)/sum(colSums(proportions))
  ## Identify components with at least 1% of weight to use as boundaries for 
  ## VAF fitting
  which.1pct <- proportions > 0.01
  which.1pct <- names(proportions)[which.1pct]
  which.1pct.min <- min(as.numeric(which.1pct))
  which.1pct.max <- max(as.numeric(which.1pct))
  onepct_pos <- means[names(means) %in% which.1pct.min:which.1pct.max]
  onepct_weights <- proportions[names(proportions) %in% which.1pct.min:which.1pct.max]
  ## Get depth difference per copy number
  d_diff <- diff(means)[1]
  # Next we identify observations that belong to the components with >1% 
  # responsibility
  ## Fit the GMM and obtain rough estimates of component membership
  peak_fits <- apply(parametric_gmm_fit(data = depth_data, means = means, variances = variances), 1, which.max)
  peak_in <- peak_fits %in% names(onepct_weights)
  ploidy_data <- data.frame("depth" = depth_data, "vaf" = vaf_data, CN = peak_fits)
  
  # Since the names/indices of "means" don't necessarily mean anything we need to 
  # shift the values for peak_fits to align with their assumed copy number 
  ploidy_data$CN <- factor(ploidy_data$CN)
  lowest_pk <- as.numeric(names(onepct_pos)[1])
  scale_fct <- lowest - lowest_pk
  ## If ploidy is specified then we have to do things slightly differently
  ## We assume that names(means)[1] == 1
  if(!is.na(ploidy)){
    scale_fct <- as.numeric(names(proportions[1])) - 1
    names(means) <- as.numeric(names(means)) - scale_fct
    lowest = as.numeric(names(proportions))[1]
    onepct_pos <- means
  }
  levels(ploidy_data$CN) <- as.numeric(levels(ploidy_data$CN)) + scale_fct
  names(means) <- as.numeric(names(means)) + scale_fct
  ploidy_data$CN <- as.numeric(as.character(ploidy_data$CN))
  ## Determine tp and ploidy
  test_ploidy <- as.numeric(names(means)[which.min(abs(means - maxpeak))])
  cov_char <- list(maxpeak = maxpeak, diff = d_diff)
  test_tp <- tp_diffploidy(cov_char, new_ploidy = test_ploidy)
  ## If the model is outrageous, return something indicating this
  if(test_tp > 1.05 | test_tp < 0 | is.na(test_tp)){
    return(data.frame("maf" = NULL, "tp" = NULL, "ploidy" = NULL, "lowest" = NULL))
  }
  # Next we prepare VAF data for fitting by first extracting VAF data
  maf_ind <- 1:nrow(ploidy_data)
  maf_ind <- maf_ind[!is.na(ploidy_data$vaf)]
  ## Generates a list where each element is a vector of VAFs that came from
  ## one bin
  maf_list <- lapply(unmerge_mafs(as.character(ploidy_data$vaf[!is.na(ploidy_data$vaf)])), as.numeric)
  ## Set names of the list to be their row index in the input and convert to a data.frame
  names(maf_list) <- maf_ind
  maf_df <- stack(maf_list)
  maf_df$ind <- as.numeric(levels(maf_df$ind)[maf_df$ind])
  ## Generate a linear model to determine general copy number from depth
  lm_df <- data.frame("CN" = as.numeric(names(means)), "depth" = means)
  model = lm(CN ~ depth, lm_df)
  cns <- table_vec(round(predict(model, data.frame("depth" = depth_data))))
  cns <- cns[as.numeric(names(cns)) >= 0]
  # Filters CN list to the nearest "sequential" CNs in the genome. Ie. the max
  # CN we consider is the highest CN that occurs twice in a row along a 
  # chromosome
  for(cn in rev(as.numeric(names(cns)))){
    vals <- which(round(predict(model, data.frame("depth" = depth_data))) == as.numeric(cn))
    if(length(vals) > 1){
      if(min(diff(vals)) == 1){
        cns <- cns[as.numeric(names(cns)) <= cn]
        break
      }
    }
  }
  max_cns <- max(as.numeric(names(cns[cns > 1])))
  # Subsample high CNs to reduce computational overhead
  if(max_cns > 10){
    cns <- c(0:9, seq(from = 10, to = min(max_cns, 15), by = 5))
    if(max_cns > 49){
      cns <- c(cns, seq(from = 50, to = min(max_cns, 90), by = 10))
    }
    if(max_cns > 99){
      cns <- c(cns, seq(from = 100, to = max_cns, by = 100))
    }
  }else{cns <- 0:max_cns}
  positions_mafs <- depth(maxpeak = maxpeak, d = d_diff, P = test_ploidy, n = cns)
  depth_maf_posterior <- parametric_gmm_fit(data = depth_data, means = positions_mafs, variances = variances)
  depth_maf_responsibilities <- depth_maf_posterior/rowSums(depth_maf_posterior)
  depth_maf_responsibilities[is.na(rowSums(depth_maf_responsibilities)),] <- 0
  # Next we perform VAF fitting using the depth probabilities as a prior
  ## Create a list of vectors where each element represents a copy number
  ## and the elements are vectors of allelic states
  arr <- lapply(cns, testMAF, tp = test_tp)
  ## Fill vectors to same length
  max_len <- max(sapply(arr, length))
  arr <- lapply(arr, function(x){
    x <- c(x, rep(NA, times = max_len - length(x)))
  })
  ## Fit a mixture model to the VAF data for each assumption of copy number 
  ## where mixture components are allelic states
  arr <- lapply(arr, function(x){
    x <- parametric_gmm_fit(data = maf_df$values, means = x, variances = maf_variances)
    x <- data.table(x)
    # Take mean likelihoods for each bin
    x <- x[,lapply(.SD, mean), by = maf_df$ind]
    return(x)
  })
  
  # Next we use the depth likelihoods as a prior to scale the VAF likelihoods
  range_vals <- 1:nrow(depth_maf_responsibilities)
  for(i in 1:length(arr)){
    # Get fits for the ith copy number
    i_arr <- data.table::copy(arr[[i]])
    # Get range of depth values with and without an associated VAF value
    i_range <- range_vals[!range_vals %in% i_arr$maf_df]
    nomaf <- data.table("maf_df" = i_range)
    # Fill columns for VAF-less bins with NAs to match i_arr
    while(ncol(nomaf) < ncol(i_arr)){
      nomaf <- cbind.data.frame(nomaf, data.table(rep(NA, times = nrow(nomaf))))
      names(nomaf) <- names(i_arr)[1:length(names(nomaf))]
    }
    na_cols <- names(i_arr)[which(apply(arr[[i]], 2, function(x)all(is.na(x))))]
    i_arr <- rbind(i_arr, nomaf)
    i_arr <- data.frame(i_arr[order(maf_df)])
    if(length(na_cols)){
      i_arr[,-(which(names(i_arr) %in% na_cols))][i_range,-1] <- depth_maf_posterior[i_range,i]
    }else{
      i_arr[i_range,-1] <- depth_maf_posterior[i_range, i]
    }
    i_arr <- data.table(i_arr)
    arr[[i]] = (i_arr[,-1] * depth_maf_responsibilities[,i])
  }
  ## For each copy number assumption (element of arr), we return the 
  ## probability of the most likely allele state at that copy number for each
  ## observation, and then cbind it all together to get a matrix of likelihoods
  ## where each column is a copy number state
  joint_probs <- lapply(arr, function(x){apply(x, 1, max, na.rm = T)}) %>% do.call(cbind, .)
  
  joint_probs <- data.table(joint_probs)
  names(joint_probs) <- as.character(cns)
  maf_posteriors <- joint_probs %>% apply(1, max) %>% mean()
  maf_scores <- data.frame("maf" = mean(maf_posteriors), "tp" = test_tp, "ploidy" = test_ploidy, "lowest" = lowest)
  return(list("model" = maf_scores, "jp_tbl" = joint_probs))
}

maf_gmm_fit_subclonal <- function(depth_data, vaf_data, chr_vec, means, variances, maf_variances, maxpeak, ploidy = NA){
  ## Compute weights of initial fit
  proportions = compute_responsibilities(data = depth_data, means = means, variances = variances)
  proportions = colSums(proportions)/sum(colSums(proportions))
  
  ## Identify components with at least 1% of weight to use as boundaries for VAF fitting
  obs_cns <- as.numeric(names(proportions))
  max_cns <- ploidy + 3
  
  incl_pos <- means[obs_cns < max_cns]
  incl_weights <- proportions[obs_cns < max_cns]
  
  int_means <- means[obs_cns - round(obs_cns) == 0]
  
  
  #which.1pct <- proportions > 0.01
  #which.1pct <- names(proportions)[which.1pct]
  #which.1pct.min <- as.numeric(which.1pct) %>% min
  #which.1pct.max <- as.numeric(which.1pct) %>% max
  #onepct_pos <- means[names(means) %in% which.1pct.min:which.1pct.max]
  #onepct_weights <- proportions[names(proportions) %in% which.1pct.min:which.1pct.max]
  
  d_diff <- diff(int_means)[1]
  
  ## get peak fits
  peak_fits <- parametric_gmm_fit(data = depth_data, means = means, variances = variances) %>% apply(1, which.max)
  
  peak_fits <- names(means)[peak_fits]
  
  peak_in <- peak_fits %in% obs_cns
  
  ploidy_data <- data.frame("depth" = depth_data, "vaf" = vaf_data, CN = peak_fits, stringsAsFactors = F)
  
  lm_df <- data.frame("CN" = as.numeric(names(means)), "depth" = means)
  
  model = lm(CN ~ depth, lm_df)
  
  cov_char <- list(maxpeak = maxpeak, diff = d_diff)
  
  test_tp <- tp_diffploidy(cov_char, new_ploidy = ploidy)
  
  cov_char <- get_coverage_characteristics(test_tp, ploidy, maxpeak)
  
  maf_ind <- 1:nrow(ploidy_data)
  maf_ind <- maf_ind[!is.na(ploidy_data$vaf)]
  
  maf_list <- lapply(unmerge_mafs(as.character(ploidy_data$vaf[!is.na(ploidy_data$vaf)])), as.numeric)
  
  names(maf_list) <- maf_ind
  
  maf_df <- stack(maf_list)
  
  maf_df$ind <- as.numeric(levels(maf_df$ind)[maf_df$ind])
  
  cns <- obs_cns
  
  positions_mafs <- cns * d_diff + cov_char$homd
  
  depth_maf_posterior <- parametric_gmm_fit(data = depth_data, means = positions_mafs, variances = variances)
  
  depth_maf_responsibilities <- depth_maf_posterior/rowSums(depth_maf_posterior)
  
  depth_maf_responsibilities[is.na(rowSums(depth_maf_responsibilities)),] <- 0
  
  arr <- lapply(cns, testMAF_sc, tp = test_tp)
  
  print(arr)
  
  max_len <- max(sapply(arr, length))
  
  ## Fill vectors to same length
  arr <- lapply(arr, function(x){
    x <- c(x, rep(NA, times = max_len - length(x)))
    #names(x) <- 0:(max_len-1)
  })
  
  arr <- lapply(arr, function(x){
    #print(x)
    #x <- t(as.matrix(x))
    maf_variances <- match_kde_height(data = maf_df$values, means = x[!is.na(x)], sd = 0.03)
    x <- parametric_gmm_fit(data = maf_df$values, means = x, variances = maf_variances)
    #t <- mixpdf_function(means = arr[[4]], proportions = colSums(x/rowSums(x))/sum(colSums(x/rowSums(x))), sd = maf_variances)
    #data.frame(x = density(maf_df$values)$x, y = density(maf_df$values)$y, prob = t(density(maf_df$values)$x)$y) %>% ggplot(aes(x = x, y = y)) + geom_line() + geom_line(aes(x = x, y = prob, color = "GMM")) + geom_vline(xintercept = arr[[4]], linetype = 3) + xlab("VAF") + ylab("Density")
    #x = as.data.frame(x)
    #x$f <- maf_df$ind
    x <- data.table(x)
    x <- x[,lapply(.SD, mean), by = maf_df$ind]
    return(x)
  }
  )
  
  range_vals <- 1:nrow(depth_maf_responsibilities)
  for(i in 1:length(arr)){
    i_arr <- data.table::copy(arr[[i]])
    i_range <- range_vals[!range_vals %in% i_arr$maf_df]
    nomaf <- data.table("maf_df" = i_range)
    while(ncol(nomaf) < ncol(i_arr)){
      nomaf <- cbind.data.frame(nomaf, data.table(rep(NA, times = nrow(nomaf))))
      names(nomaf) <- names(i_arr)[1:length(names(nomaf))]
    }
    na_cols <- names(i_arr)[which(apply(arr[[i]], 2, function(x)all(is.na(x))))]
    i_arr <- rbind(i_arr, nomaf)
    i_arr <- data.frame(i_arr[order(maf_df)])
    if(length(na_cols)){
      i_arr[,-(which(names(i_arr) %in% na_cols))][i_range,-1] <- depth_maf_posterior[i_range,i]
    }else{
      i_arr[i_range,-1] <- depth_maf_posterior[i_range, i]
    }
    i_arr <- data.table(i_arr)
    arr[[i]] = (i_arr[,-1] * depth_maf_responsibilities[,i])
  }
  
  joint_probs <- lapply(arr, function(x){apply(x, 1, max, na.rm = T)}) %>% do.call(cbind, .)
  
  joint_probs <- data.table(joint_probs)
  names(joint_probs) <- as.character(cns)
  
  maf_posteriors <- joint_probs %>% apply(1, max) %>% mean()
  
  maf_scores <- data.frame("maf" = mean(maf_posteriors), "tp" = test_tp, "ploidy" = ploidy)
  
  return(list("model" = maf_scores, "jp_tbl" = joint_probs))
  
}

maf_gmm_fit_subclonal_prior <- function(depth_data, vaf_data, chr_vec, means, variances, maf_variances, maxpeak, ploidy = ploidy, tp = tp, cn_list){
  ## Compute weights of initial fit
  proportions = compute_responsibilities(data = depth_data, means = means, variances = variances)
  proportions = colSums(proportions)/sum(colSums(proportions))
  
  depth_vaf_df <- split(data.frame("d" = depth_data, "v" = vaf_data, "c" = chr_vec), f = chr_vec)
  
  curr_d_diff <- get_coverage_characteristics(tp, ploidy, maxpeak)$diff
  
  tot_cns <- as.numeric(names(table_vec(do.call(c, cn_list))))
  
  chr_joints <- list()
  for(chromosome in names(cn_list)){
    chr_data <- depth_vaf_df[[chromosome]]
    chr_means <- depth(maxpeak = maxpeak, d = curr_d_diff, P = ploidy, n = cn_list[[chromosome]])
    depth_maf_posterior <- parametric_gmm_fit(data = chr_data$d, means = chr_means, variances = variances)
    depth_maf_responsibilities <- depth_maf_posterior/rowSums(depth_maf_posterior)
    depth_maf_responsibilities[is.na(rowSums(depth_maf_responsibilities)),] <- 0
    cns <- as.numeric(names(chr_means))
    arr <- lapply(cns, testMAF_sc, tp = tp)
    maf_ind <- 1:nrow(chr_data)
    maf_ind <- maf_ind[!is.na(chr_data$v)]
    maf_list <- lapply(unmerge_mafs(as.character(chr_data$v[maf_ind])), as.numeric)
    names(maf_list) <- maf_ind
    maf_df <- stack(maf_list)
    maf_df$ind <- as.numeric(levels(maf_df$ind)[maf_df$ind])
    
    max_len <- max(sapply(arr, length))
    
    ## Fill vectors to same length
    arr <- lapply(arr, function(x){
      x <- c(x, rep(NA, times = max_len - length(x)))
      #names(x) <- 0:(max_len-1)
    })
    
    arr <- lapply(arr, function(x){
      #x <- t(as.matrix(x))
      #maf_variances <- match_kde_height(data = maf_df$values, means = x[!is.na(x)], sd = 0.03)
      
      x <- parametric_gmm_fit(data = maf_df$values, means = x, variances = maf_variances)
      #plot_density_gmm(maf_df$values, means = arr[[4]][1:3], sd = maf_variances, weights = colSums(x/rowSums(x, na.rm = T), na.rm = T)[1:3])
      
      #t <- mixpdf_function(means = arr[[4]], proportions = colSums(x/rowSums(x))/sum(colSums(x/rowSums(x))), sd = maf_variances)
      #data.frame(x = density(maf_df$values)$x, y = density(maf_df$values)$y, prob = t(density(maf_df$values)$x)$y) %>% ggplot(aes(x = x, y = y)) + geom_line() + geom_line(aes(x = x, y = prob, color = "GMM")) + geom_vline(xintercept = arr[[4]], linetype = 3) + xlab("VAF") + ylab("Density")
      #x = as.data.frame(x)
      #x$f <- maf_df$ind
      x <- data.table(x)
      x <- x[,lapply(.SD, mean), by = maf_df$ind]
      return(x)
    })
    
    range_vals <- 1:nrow(depth_maf_responsibilities)
    for(i in 1:length(arr)){
      i_arr <- data.table::copy(arr[[i]])
      i_range <- range_vals[!range_vals %in% i_arr$maf_df]
      nomaf <- data.table("maf_df" = i_range)
      while(ncol(nomaf) < ncol(i_arr)){
        nomaf <- cbind.data.frame(nomaf, data.table(rep(NA, times = nrow(nomaf))))
        names(nomaf) <- names(i_arr)[1:length(names(nomaf))]
      }
      na_cols <- names(i_arr)[which(apply(arr[[i]], 2, function(x)all(is.na(x))))]
      i_arr <- rbind(i_arr, nomaf)
      i_arr <- data.frame(i_arr[order(maf_df)])
      if(length(na_cols)){
        i_arr[,-(which(names(i_arr) %in% na_cols))][i_range,-1] <- depth_maf_posterior[i_range,i]
      }else{
        i_arr[i_range,-1] <- depth_maf_posterior[i_range, i]
      }
      i_arr <- data.table(i_arr)
      arr[[i]] = (i_arr[,-1] * depth_maf_responsibilities[,i])
    }
    
    joint_probs <- lapply(arr, function(x){apply(x, 1, max, na.rm = T)}) %>% do.call(cbind, .)
    
    colnames(joint_probs) <- cns
    
    to_add <- tot_cns[!as.character(tot_cns) %in% colnames(joint_probs)]
    
    to_add_dt <- data.table(matrix(0, nrow = nrow(joint_probs), ncol = length(to_add)))
    
    names(to_add_dt) <- as.character(to_add)
    
    joint_probs <- cbind(to_add_dt, joint_probs)
    
    setcolorder(joint_probs, as.character(tot_cns))
    
    joint_probs <- data.table(joint_probs)
    chr_joints <- c(chr_joints, list(joint_probs))
    names(chr_joints)[length(chr_joints)] <- chromosome
  }
  chr_joints <- rbindlist(chr_joints)
  
  maf_posteriors <- chr_joints %>% apply(1, max) %>% mean()
  
  maf_scores <- data.frame("maf" = mean(maf_posteriors), "tp" = tp, "ploidy" = ploidy)
  
  return(list("model" = maf_scores, "jp_tbl" = chr_joints))
  
}

maf_gmm_fit_subclonal_prior_segments <- function(depth_data, vaf_data, chr_vec, means, variances, maf_variances, maxpeak, ploidy = ploidy, tp = tp, cn_list, pos_list, seg_tbl){
  ## Compute weights of initial fit
  proportions = compute_responsibilities(data = depth_data, means = means, variances = variances)
  proportions = colSums(proportions)/sum(colSums(proportions))
  
  seg_vec = rep(seg_tbl$segment, seg_tbl$n)
  
  depth_vaf_df <- split(data.frame("d" = depth_data, "v" = vaf_data, "c" = chr_vec), f = list(chr_vec, seg_vec), drop = T)
  
  curr_d_diff <- get_coverage_characteristics(tp, ploidy, maxpeak)$diff
  
  tot_cns <- as.numeric(names(table_vec(do.call(c, cn_list))))
  
  chr_joints <- list()
  for(line in 1:nrow(seg_tbl)){
    chromosome = seg_tbl$chr[line]
    chr_data <- depth_vaf_df[[line]]
    chr_means <- depth(maxpeak = maxpeak, d = curr_d_diff, P = ploidy, n = pos_list[[line]])
    names(chr_means) <- cn_list[[chromosome]]
    depth_maf_posterior <- parametric_gmm_fit(data = chr_data$d, means = chr_means, variances = variances)
    depth_maf_responsibilities <- depth_maf_posterior/rowSums(depth_maf_posterior)
    depth_maf_responsibilities[is.na(rowSums(depth_maf_responsibilities)),] <- 0
    cns <- as.numeric(names(chr_means))
    #arr <- lapply(cns, testMAF_sc, tp = tp)
    #maf_ind <- 1:nrow(chr_data)
    #maf_ind <- maf_ind[!is.na(chr_data$v)]
    #maf_list <- lapply(unmerge_mafs(as.character(chr_data$v[maf_ind])), as.numeric)
    #names(maf_list) <- maf_ind
    #if(length(maf_ind)){
    #  maf_df <- stack(maf_list)
    #  maf_df$ind <- as.numeric(levels(maf_df$ind)[maf_df$ind])
      
      
    #  max_len <- max(sapply(arr, length))
      
    #  ## Fill vectors to same length
    #  arr <- lapply(arr, function(x){
    #    x <- c(x, rep(NA, times = max_len - length(x)))
    #    #names(x) <- 0:(max_len-1)
    #  })
      
    #  arr <- lapply(arr, function(x){
        #x <- t(as.matrix(x))
        #maf_variances <- match_kde_height(data = maf_df$values, means = x[!is.na(x)], sd = 0.03)
        
    #    x <- parametric_gmm_fit(data = maf_df$values, means = x, variances = maf_variances)
        #plot_density_gmm(maf_df$values, means = arr[[4]][1:3], sd = maf_variances, weights = colSums(x/rowSums(x, na.rm = T), na.rm = T)[1:3])
        
        #t <- mixpdf_function(means = arr[[4]], proportions = colSums(x/rowSums(x))/sum(colSums(x/rowSums(x))), sd = maf_variances)
        #data.frame(x = density(maf_df$values)$x, y = density(maf_df$values)$y, prob = t(density(maf_df$values)$x)$y) %>% ggplot(aes(x = x, y = y)) + geom_line() + geom_line(aes(x = x, y = prob, color = "GMM")) + geom_vline(xintercept = arr[[4]], linetype = 3) + xlab("VAF") + ylab("Density")
        #x = as.data.frame(x)
        #x$f <- maf_df$ind
    #    x <- data.table(x)
    #    x <- x[,lapply(.SD, mean), by = maf_df$ind]
    #    return(x)
    #  })
      
    #  range_vals <- 1:nrow(depth_maf_responsibilities)
    #  for(i in 1:length(arr)){
    #    i_arr <- data.table::copy(arr[[i]])
    #    i_range <- range_vals[!range_vals %in% i_arr$maf_df]
    #    na_cols <- names(i_arr)[which(apply(arr[[i]], 2, function(x)all(is.na(x))))]
    #    if(length(i_range)){
    #      nomaf <- data.table("maf_df" = i_range)
    #      while(ncol(nomaf) < ncol(i_arr)){
    #        nomaf <- cbind.data.frame(nomaf, data.table(rep(NA, times = nrow(nomaf))))
    #        names(nomaf) <- names(i_arr)[1:length(names(nomaf))]
    #      }
         
    #      i_arr <- rbind(i_arr, nomaf)
    #      i_arr <- data.frame(i_arr[order(maf_df)])
    #    }
    #    if(length(na_cols) & length(i_range)){
    #      i_arr[,-(which(names(i_arr) %in% na_cols))][i_range,-1] <- depth_maf_posterior[i_range,i]
    #    }else if(length(i_range)){
    #      i_arr[i_range,-1] <- depth_maf_posterior[i_range, i]
    #    }
    #    i_arr <- data.table(i_arr)
    #    arr[[i]] = (i_arr[,-1] * depth_maf_responsibilities[,i])
    #  }
      
    #  joint_probs <- lapply(arr, function(x){apply(x, 1, max, na.rm = T)}) %>% do.call(cbind, .)
    #}else{joint_probs <- depth_maf_posterior}
    
    joint_probs <- depth_maf_posterior
    
    colnames(joint_probs) <- as.character(cns)
    
    to_add <- tot_cns[!as.character(tot_cns) %in% colnames(joint_probs)]
    
    to_add_dt <- data.table(matrix(0, nrow = nrow(joint_probs), ncol = length(to_add)))
    
    names(to_add_dt) <- as.character(to_add)
    
    joint_probs <- cbind(to_add_dt, joint_probs)
    
    setcolorder(joint_probs, as.character(tot_cns))
    
    joint_probs <- data.table(joint_probs)
    chr_joints <- c(chr_joints, list(joint_probs))
    names(chr_joints)[length(chr_joints)] <- chromosome
    #print(line)
  }
  chr_joints <- rbindlist(chr_joints)
  
  maf_posteriors <- chr_joints %>% apply(1, max) %>% mean()
  
  maf_scores <- data.frame("maf" = mean(maf_posteriors), "tp" = tp, "ploidy" = ploidy)
  
  return(list("model" = maf_scores, "jp_tbl" = chr_joints))
  
}

na_or_true <- function(x, flip = F){
  if(flip){
    return(!(is.na(x)) & x)
  }
  return((is.na(x)) | x)
}



plot_density_gmm <- function(data, means, weights, sd, bw = "nrd0", ...){
  data <- data[data < max(means)]
  weights <- weights/sum(weights)
  den <- density(data, n = 2^16, bw = bw)
  pdf_fun <- mixpdf_function(means, weights, sd)
  den_pdf <- data.frame(x = den$x, y = den$y, prob = pdf_fun(den$x)$y)
  plot <- den_pdf %>% filter(x < max(means)) %>% ggplot(aes(x = x, y = y)) + geom_line() + geom_line(aes(x = x, y = prob, color = "Predicted")) + geom_vline(xintercept = means[means < max(data)], alpha = 0.1)
  return(plot)
  
}

lagged_df <- function(in_df){
  out <- rbindlist(list(in_df, in_df[1,]))
  out[nrow(out),] <- NA
  return(out[-1,])
}

staggered_seq <- function(to){
  out <- seq(from = 0, to = 10)
  if(to <= 10){
    return(out)
  }
  out <- c(out, seq(from = 15, to = min(max(15, ceiling(to/5)*5), 50), by = 5))
  if(to <= 50){
    return(out)
  }
  out <- c(out, seq(from = 60, to = min(max(60, ceiling(to/10)*10), 100), by = 10))
  if(to <= 100){
    return(out)
  }
  out <- c(out, seq(from = 150, to = min(max(150, ceiling(to/50)*50), 1000), by = 50))
  if(to <= 1000){
    return(out)
  }
  out <- c(out, seq(from = 1250, to = max(1250, ceiling(to/250)*250), by = 250))
  return(out)
}

n50_fun <- function(lengths){
  target <- sum(as.numeric(lengths))*0.5
  lengths <- sort(lengths)
  s_sum = 0
  while(s_sum < target){
    to_add <- lengths[1]
    lengths <- lengths[-1]
    s_sum <- s_sum + to_add
  }
  return(lengths[1])
}

nx_fun <- function(lengths, n){
  if(n < 0 | n > 1){
    stop("Bad n value")
  }
  target <- sum(as.numeric(lengths))*n
  lengths <- sort(lengths)
  s_sum = 0
  while(s_sum < target){
    to_add <- lengths[1]
    lengths <- lengths[-1]
    s_sum <- s_sum + to_add
    #print(s_sum)
  }
  return(lengths[1])
}

len_fix <- function(x){
  if(length(x) == 0){
    NA
  }
  else{
    x
  }
}

weighted_median <- function(x, w){
  na_ind <- is.na(x)
  x <- x[!na_ind]
  w <- w[!na_ind]
  ord <- order(x)
  x <- x[ord]
  w <- w[ord]
  middling <- cumsum(w)/sum(w)
  which.mid <- max(which(middling <= 0.5))
  return(x[which.mid])
}

lowesswrapper <- function(x, y, bw){
  ord <- order(x)
  fit <- lowess(x = x, y = y, f = bw, iter = 3)
  out <- list()
  out$fitted[ord] <- fit$y
  out$residuals <- y - out$fitted
  return(out)
}

ind_to_log <- function(vec, ind){
  vec = rep(F, length(vec))
  vec[ind] = T
  return(vec)
}

estimateVariance <- function(to_seg, size = 10, compress_iters = 5, folds = 100){
  orig_seg <- to_seg
  var_list <- c()
  for(fold in 1:folds){
    to_seg <- orig_seg
    iter_seg <- to_seg
    lenseg <- length(to_seg)
    centers = sort(sample(lenseg, size))
    
    while(any(centers <= compress_iters | centers >= lenseg - compress_iters | any(abs(diff(centers)) < compress_iters*2 + 1))){
      print(centers)
      centers = sort(sample(lenseg, size))
    }
    #orig_centers <- centers
    to_seg <- unlist(lapply(centers, function(x){
      to_seg[(x-compress_iters):(x+compress_iters)]
    }))
    centers <- seq(from = 1 + compress_iters, by = compress_iters*2 + 1, length.out = size)
    lenseg <- length(to_seg)
    n = rep(1, lenseg)
    for(it in 1:(compress_iters)){
      g <- graph(edges = c(1, rep(2:(lenseg - 1), each = 2), lenseg), directed = F)
      ## Returns the left vs right side edge for the centers
      keep <- centers - as.numeric(na_or_true((abs(to_seg/n - shift(to_seg/n, type = "lag")) < abs(to_seg/n - shift(to_seg/n, type = "lead"))), flip = T))[centers]
      g <- delete_edges(g, E(g)[-keep])
      to_seg <- data.table("d" = to_seg, "s" = components(g)$membership)[,.(d=sum(d)),by = s]$d
      lenseg = length(to_seg)
      centers = unique(components(g)$membership[centers])
      n = rep(1, lenseg)
      n[centers] = it + 1
    }
    segs <- split(which(rep(n, n) > 1), f = rep(1:length(centers), each = compress_iters + 1))
    
    var_list <- c(var_list, median(unlist(lapply(segs, function(x)sd(iter_seg[x])))))
  }
  return(median(var_list))
}

seg <- function(to_seg, lenseg, old_seg, n, transition_lik, init_var){
  reference = to_seg
  if(all(n == 1)){
    vars = rep(init_var, lenseg)
  }else{
    v = data.table(cbind(reference, rep(1:length(n), n)))
    print(v)
    setnames(v, c("reference", "V2"))
    v[,n:=.N, by = V2]
    v[n > 2,v:=sd(reference), by = V2]
    v[n <= 2, v:= init_var]
    vars = v[,.(v = first(v)), by = V2]$v
  }
  to_seg = aggregate(reference, FUN = sum, by = list(rep(1:length(n), n)))[,2]
  
  while(lenseg < old_seg & lenseg > 1){
    old_seg <- lenseg
    g <- graph(edges = c(1, rep(2:(lenseg - 1), each = 2), lenseg), directed = F)
    
    lp <- zt_p(to_seg/n, shift(to_seg/n, type = "lag"), shift(vars, type = "lag"))[-1]

    breaks <- which(lp < transition_lik)
    
    g <- delete_edges(g, breaks)
    
    to_seg <- data.table("d" = to_seg, "n" = n, "s" = components(g)$membership, "v" = vars)[,lapply(.SD, sum), by = s, .SDcols = 1:2]
    #to_seg[,m:=mean(d), by = s][,v.i:= (d - m)^2]
    
    #to_seg[,n:=sum(n), by = s]
    
    #to_seg = to_seg[,.(d = sum(d), n = first(n), v = sqrt(sum(v.i)/(n-1))), by = s]
    
    
    n <- to_seg$n
    to_seg <- to_seg$d
    lenseg = length(to_seg)
    if(length(reference) != length(rep(1:length(n), n))){
      print(length(reference))
      print(n)
    }  
    v = data.table(cbind(reference, rep(1:length(n), n)))
  
    v[,n:=.N, by = V2]
    v[n > 2,v:=sd(reference), by = V2]
    v[n <= 2, v:= init_var]
    vars = v[,.(v = first(v)), by = V2]$v
  }
  return(list(to_seg, lenseg, n))
}
seed_compress <- function(to_seg, compress_iters = 10, transition_lik = 0.01, var = estimateVariance(to_seg, size = 100, compress_iters = compress_iters)){
  orig_seg <- to_seg
  lenseg = length(to_seg)
  old_seg <- Inf
  n = rep(1, lenseg)
  ## Segmentation using input variance estimate
  r <- seg(to_seg, lenseg, old_seg, n, transition_lik = transition_lik, init_var = var)
  old_seg <- lenseg
  to_seg <- r[[1]]
  lenseg <- r[[2]]
  n <- r[[3]]
  if(lenseg  == 1){
    return(list("segs" = rep(1, times = length(orig_seg)), "ll" = sum(log(normpdf(orig_seg, mean = mean(orig_seg), sd = sd(orig_seg))))))
  }

  old_seg <- lenseg
  ## Merge segments with size < compress iterations
  for(minN in 1:compress_iters){
    if(all(n > minN)){
      next
    }
    while(any(n <= minN)){
      #print(which(n <= minN))
      lp <- zt_p(to_seg/n, shift(to_seg/n, type = "lag"), var)
      lp[1] <- -1
      ls <- zt_p(to_seg/n, shift(to_seg/n, type = "lead"), var)
      ls[length(ls)] <- -1
      lp <- lp[n <= minN]
      ls <- ls[n <= minN]
      transitions = pmin(lp, ls)
      merge <- which(n <= minN) - as.numeric(ls < lp)
      merge <- merge[order(transitions)[1]]
      g <- graph(edges = c(1, rep(2:(lenseg - 1), each = 2), lenseg), directed = F)
      g <- delete_edges(g, (1:length(E(g)))[-merge])
      to_seg <- data.table("d" = to_seg, "n" = n, "s" = components(g)$membership)[,lapply(.SD, sum), by = s, .SDcols = 1:2]
      n <- to_seg$n
      to_seg <- to_seg$d
      lenseg = length(to_seg)
    }
  }
  
  if(lenseg == 1){
    return(list("segs" = rep(1, times = length(orig_seg)), "ll" = sum(log(normpdf(orig_seg, mean = mean(orig_seg), sd = sd(orig_seg))))))
  }
  
  ## Rerun segmentation
  if(F){
    r <- seg(orig_seg, lenseg, old_seg, n, transition_lik = transition_lik, init_var = var)
    old_seg <- lenseg
    to_seg <- r[[1]]
    lenseg <- r[[2]]
    n <- r[[3]]
  }
  
  
  d <- data.table(orig_seg, "s" = rep(1:length(n), n))
  d[,segged:=mean(orig_seg), by = s]
  d[,v:=sd(orig_seg), by = s]

  LL <- sum(log(normpdf(d$orig_seg, mean = d$segged, sd = d$v)))

  return(list("segs" = rep(1:length(n), n), "ll" = LL))
}

cdf <- function(x, mean, sd){
  pnorm(q = zt(x, mean, sd))
}

sweep_seeds <- function(to_seg, compress_iters = 10, transition_lik = 0.01, var = estimateVariance(to_seg, size = 100, compress_iters = 10)){
  #target_var = var
  ## Find variance for all = one segment
  current_var = sd(to_seg)
  
  ## Bisection search
  max_iters = 2^10
  iter = 1
  
  ## Initial likelihood value
  trans_lik = 0
  ## If variance is too high, need to drop threshold (returns negative). Conversely if too low, need to up threshold (positive)
  ## (leftover from the disabled bisection search; `vars` is not yet defined at this point)
  #direction <- sign(current_var - vars[length(vars)])
  vars <- c()
  liks <- c()
  lls <- c()
  nsegs <- c()
  for(trans_lik in seq(from = 0, to = 1, by = 0.01)){
    #trans_lik = trans_lik + direction/2^iter
    c = seed_compress(to_seg, compress_iters = 10, transition_lik = trans_lik, var)
    condition = T
    d <- data.table("d" = to_seg, "s" = c$segs, "n" = 1:length(to_seg))
    ds <- d[,.(d = mean(d), v = sd(d)), by = s]
    if(nrow(ds) == 1){
      condition = F
    }
    while(condition){
      ds <- ds[order(d)]
      p <- ks.test(d[s == ds[1]$s]$d, d[s == ds[2]$s]$d, alternative = "greater")$p.value
      if(p > 0.05){
        d[s == ds[2]$s]$s <- d[s == ds[1]$s]$s[1]
        ds[s == ds[2]$s]$s <- ds[s == ds[1]$s]$s
      }else{
        if(exists("done_dt")){
          done_dt <- rbind(done_dt, d[s == ds[1]$s])
        }else{
          done_dt <- d[s == ds[1]$s]
        }
        d <- d[s != ds[1]$s]
      }
      ds <- d[,.(d = mean(d), v = sd(d)), by = s]
      if(nrow(ds) == 1){
        condition = F
        done_dt <- rbind(done_dt, d[s == ds[1]$s])[order(n)]
        d <- done_dt
        rm(done_dt)
      }
    }
    
    d[,m:=mean(d), by = s]
    d[,v:=sd(d), by = s]
    c$ll <- sum(log(normpdf(d$d, mean = d$m, sd = d$v)))
    nsegs <- c(nsegs, length(unique(d$s)))
    liks <- c(liks, trans_lik)
    lls <- c(lls, c$ll)
    vars <- c(vars, median(data.table("d" = to_seg, "s" = c$segs)[,.(v = sd(d)), by = s]$v))
    #direction <- sign(current_var - vars[length(vars)])
    iter = iter + 1
  }
  aic = 2*nsegs - 2*log(exp(lls/length(to_seg)))
  ## Find models with lowest aic
  mins <- which(aic == min(aic))
  
}

ks.seg <- function(to_seg, existing_segs, p){
  #dt <- data.table(to_seg, existing_segs)
  split_segs <- split(to_seg, f = existing_segs)
  
  n = table_vec(existing_segs)
  
  lp = unlist(lapply(2:length(split_segs), function(x){ks.test(split_segs[[x - 1]], split_segs[[x]])$p.value}))
  
  g <- graph(edges = c(1, rep(2:(length(unique(existing_segs)) - 1), each = 2), length(unique(existing_segs))), directed = F)
  
  g <- delete_edges(g, which(lp <= p))
  
  out_segs <- rep(components(g)$membership, n)
  
  if(length(unique(out_segs)) != length(unique(existing_segs)) & length(unique(out_segs)) > 1){
    return(ks.seg(to_seg, out_segs, p))
  }else{
    return(out_segs)
  }
}


seed_seg_existing <- function(to_seg, old_segs, existing_segs){
  orig_dat <- to_seg
  to_seg <- orig_dat
  
#  ks_examples <- split(to_seg, old_segs)
  
#  if(length(ks_examples) < 2){
#    p_thresh = 0.01
#  }else{
#    p_thresh = unlist(lapply(2:length(ks_examples), function(x){ks.test(ks_examples[[x - 1]], ks_examples[[x]])$p.value}))
#    p_thresh = max(p_thresh)
#  }
  p_thresh = 0.1

  
  
  dt <- data.table(to_seg, existing_segs)
  seg_var <- dt[,.(sd = sd(to_seg), n = .N), by = existing_segs]
  n = seg_var$n
  lenseg = length(to_seg)
  
  #var = estimateVariance(to_seg, compress_iters = round(lenseg/50), size = 100)
  ## First merge length one segs
  while(any(n < 2)){
    to_seg <- dt[,.(to_seg = sum(to_seg)), by = existing_segs]$to_seg
    lenseg = length(to_seg)
    g <- graph(edges = c(1, rep(2:(lenseg - 1), each = 2), lenseg), directed = F)
    lp <- abs(to_seg/n - shift(to_seg/n, type = "lag"))
    lp[1] <- Inf
    ls <- abs(to_seg/n - shift(to_seg/n, type = "lead"))
    ls[length(ls)] <- Inf
    merge <- which(n < 2) - as.numeric(lp < ls)[n < 2]
    g <- delete_edges(g, (1:length(E(g)))[-merge])
    to_seg <- data.table("d" = to_seg, "n" = n, "s" = components(g)$membership)[,lapply(.SD, sum), by = s, .SDcols = 1:2]
    n <- to_seg$n
    to_seg <- to_seg$d
    lenseg = length(to_seg)
  }
  
  segs <- rep(1:length(n), n)
  
  if(length(n) > 1){
    segs <- ks.seg(orig_dat, segs, p_thresh)
  }else{segs <- rep(1, n)}
  return(segs)
}
  
cn_from_dp <- function(dp, maxpeak, tp, ploidy){
  (dp - get_coverage_characteristics(tp, ploidy, maxpeak)$homd)/get_coverage_characteristics(tp, ploidy, maxpeak)$diff
}
#' @import data.table
#' 