# phasing.R
#
# TODO: na_thresh no longer used
#
# Refactor ideas
# - would be good to rework nested lapply()s into func
whichmaxNA <- function(x) {
# which.max that returns NA if any data is NA
ifelse(any(is.na(x)), NA, which.max(x))
}
haploDiff <- function(x) {
diff <- x[,1] == x[,2]
sum(diff, na.rm=TRUE)/nrow(x)
}
rowSumsNA <- function(x) {
apply(x, 1, function(x) {
if (all(is.na(x)))
return(NA)
sum(x, na.rm=TRUE)
})
}
#
#logSumExpPosterior <- function(liks, pi) {
# # log sum exp trick for list of matrices of loci x individuals, each for a
# # particular haplotype
# bc <- t(mapply(function(x, p) colSums(log(x)) + log(p), liks, pi))
#
# # denominator (same for both haplotype probs), uses TLOP
# max_bc <- apply(bc, 2, max)
# scaled_bc <- sweep(bc, 2, max_bc, FUN='-') # subtract B
# denom <- log(colSums(exp(scaled_bc))) + max_bc
# lse <- sweep(bc, 2, denom, FUN='-')
# browser()
# return(lse)
#}
logSumExpPosterior <- function(liks, pi) {
# log sum exp trick for list of matrices of loci x individuals, each for a
# particular haplotype
# note: cluster membership should be biased based on missingess since the same loci
# are missingin both ll calss
bc <- t(mapply(function(x, p) colSums(log(x), na.rm=TRUE) + log(p), liks, pi))
# denominator (same for both haplotype probs), uses TLOP
max_bc <- apply(bc, 2, max)
scaled_bc <- sweep(bc, 2, max_bc, FUN='-') # subtract B
denom <- log(colSums(exp(scaled_bc))) + max_bc
sweep(bc, 2, denom, FUN='-')
}
isConverged <- function(ll, ll_last, eps, debug=FALSE) {
conv <- abs(na.exclude(ll_last - ll)) < eps
if (debug) message(sprintf("%d of %d individuals converged", sum(conv), length(conv)))
if (all(conv)) return(TRUE)
return(FALSE)
}
reshapeIndLL <- function(lls_inds) {
# reshape the individual likelihoods from each iteration so that they're a
# dataframe
ll <- do.call(rbind, mapply(function(x, hap) {
d <- as.data.frame(do.call(rbind, x))
colnames(d) <- paste('ind', seq_len(ncol(d)), sep="_")
d$iter <- seq_len(nrow(d))
d$hap <- hap
d
}, lls_inds, 1L:2L, SIMPLIFY=FALSE))
setNames(melt(ll, id.vars=c('iter', 'hap')), c("iter", "hap", "ind", "ll"))
}
reshapeHapLL <- function(lls_haps) {
# reshape the loci likelihoods, from each iteration
do.call(rbind, mapply(function(x, i) {
d <- as.data.frame(x)
d$loci <- seq_len(nrow(x))
d$iter <- i
d[, 1] <- ifelse(d[, 1] == 0, NA, d[, 1])
d[, 2] <- ifelse(d[, 2] == 0, NA, d[, 2])
setNames(d, c("h1", "h2", "loci", "iter"))
}, lls_haps, seq_along(lls_haps), SIMPLIFY=FALSE))
}
#' Use EM Haplotype Phasing by Imputation Algorithm (internal function)
#'
#' @param pgeno progeny genotypes
#' @param fgeno father (or other parent) genotypes
#' @param freqs allele frequencies
#' @param other_parents indices to corresponding father genotype columns in fgeno
#' @param ehet heterozygous error rate
#' @param ehom homozygous error rate
#' @param free_pi whether to allow pi to vary (can cause identifiability issues
#' @param na_thresh how much missingness to tolerate before pruning individual
#' @param max_iter maximum number of EM iterations before quiting
#' @param init optional initial reponsibilities (advanced use)
#' @param extra include optional debugging output (advanced use)
#' @param eps epsilon in log likelihood before calling convergence
emHaplotypeImpute <- function(pgeno, fgeno, other_parents, freqs, ehet, ehom,
free_pi=FALSE, na_thresh=0.8, init=NULL, extra=FALSE,
max_iter=60L, eps=1e-9) {
# initialize EM pi and responsibility
nloci <- length(pgeno)
nind <- ncol(pgeno)
if (nloci == 0 || nind == 0)
stop(sprintf("emHaplotypeImpute(): nind or nloci = 0"))
pi <- c(0.5, 0.5)
resp <- matrix(0, nrow=nind, ncol=2)
# prune some individuals with high missingness, so we can get an initial
# estimate of cluster responsibility
# REMOVED; not needed anymore
#remove_inds <- pruneWhichIndividuals(pgeno, na_thresh)
# initialize responsibilities
if (is.null(init))
a <- runif(nind)
else
a <- init
init_resp <- resp <- rbind(a, 1-a)
# calculate likelihoods for each shared parent allele being (0, 1) ONCE for
# all individuals, progeny
# NOTE: we decrement other_parents since C++ is 0 indexed.
liks <- geno_ll(pgeno, fgeno, other_parents-1L, freqs, ehet, ehom)
thetas <- list()
lls_inds <- list(list(), list())
lls_haps <- list()
#clusters <- list()
converged <- FALSE
for (i in seq_len(max_iter)) {
if (i > 1) {
# get the total likehood given last MLE for all individuals' MLE alleles, for k in (1, 2)
# earlier version useding C++ func;
#mlm1 <- max_ll_matrix(liks[[1]], liks[[2]], theta_mle[, 1])
#mlm2 <- max_ll_matrix(liks[[2]], liks[[2]], theta_mle[, 2])
mlm1 <- do.call(rbind, lapply(1:nrow(theta_mle), function(i) liks[[1L+theta_mle[i, 1]]][i,]))
mlm2 <- do.call(rbind, lapply(1:nrow(theta_mle), function(i) liks[[1L+theta_mle[i, 2]]][i,]))
#browser()
# individual log likelihoods
ll1 <- colSums(log(mlm1), na.rm=TRUE)
ll2 <- colSums(log(mlm2), na.rm=TRUE)
# append these log likeloods to list; yes this is inefficient.... TODO
lls_inds[[1]] <- c(lls_inds[[1]], list(ll1))
lls_inds[[2]] <- c(lls_inds[[2]], list(ll2))
#browser()
# check for convergence, ll1(t), ll2(t), ll1(t-1), ll2(t-1)
# use t - 2 here, since we just appened these LLs to list
if (i > 2) {
converged <- isConverged(ll1, lls_inds[[1]][[i-2]], eps) && isConverged(ll2, lls_inds[[2]][[i-2]], eps)
if (converged)
break
}
# E step
ll_resp <- logSumExpPosterior(list(mlm1, mlm2), pi)
resp <- exp(logSumExpPosterior(list(mlm1, mlm2), pi))
#clusters <- c(clusters, list(resp))
#browser()
}
# M step
# compute pi weights
if (free_pi) # allow free-varying pi
pi <- rowMeans(resp)
# compute likelihoods for each (loci, individual). Returns a list of two matrices:
# likelihood for 0 and 1 alleles of theta_k (k = haplotype)
# TODO name, not log lik
ll_weighted <- lapply(1L:2L, function(k) {
lapply(liks, function(lik) {
# loops over likelihoods for an allele in (0, 1)
sweep(log(lik), MARGIN=2, STATS=resp[k, , drop=FALSE], FUN=`*`)
})
})
# log likelihoods of each loci's allele, for both haplotypes k in (1, 2)
# TODO: missingness handled acceptably here?
lls <- lapply(ll_weighted, function(hap) do.call(cbind, lapply(hap, function(x) rowSumsNA(x))))
#browser()
lls_haps <- c(lls_haps, lls)
# MLE alleles for each haplotype k in (1, 2)
#browser()
theta_mle <- do.call(cbind, lapply(lls, function(x) apply(x, 1, whichmaxNA)-1L))
thetas <- c(thetas, list(theta_mle))
}
iter_ind_lls <- reshapeIndLL(lls_inds)
iter_hap_lls <- reshapeHapLL(lls_haps)
stopifnot(ncol(resp) == length(other_parents)) # this should *always* be the case
colnames(resp) <- colnames(pgeno)
out <- list(haplos=theta_mle, cluster=resp, pi=pi, niter=i-1, converged=converged,
ll=rbind(ll1, ll2), init=a, haplos_lls=lls)
if (extra)
out <- c(out, list(iter_theta=thetas, iter_ind_lls=iter_ind_lls,
iter_hap_lls=iter_hap_lls))
out
}
#' Create a sibling family for a single parent.
#'
#' @param x a ProgenyArray object
#' @param parent the parent to bring the half sib family
#'
#' @export
sibFamily <- function(x, parent) {
pars <- parents(x)
# This builds a full-sib family
tmp <- pars[pars$parent_1 == parent | pars$parent_2 == parent,
c("progeny", "parent_1", "parent_2")]
if (nrow(tmp) == 0) {
warning(sprintf("sibFamily(): parent '%d' has no offspring", parent))
return(NULL)
}
prog_i <- tmp$progeny
other_parents <- apply(tmp[, -1], 1, function(x) {
if (x[1] == x[2]) return(parent) # selfed ind; return any parent
setdiff(x, parent)
})
data.frame(focal_parent=parent, other_parent=other_parents, progeny=prog_i)
}
#' Get all sibling families for a ProgenyArray object
allSibFamilies <- function(x) {
pars <- seq_len(ncol(parentGenotypes(x)))
parnames <- colnames(parentGenotypes(x))
setNames(lapply(pars, function(p) sibFamily(x, p)), parnames)
}
#' Filter sibling families by minimum number of offspring
filterSibFamilies <- function(sibfamilies, min_child) {
# parent_names is used for friendlier messages
msg <- "filterSibFamilies(): parent %d ('%s') has fewer than %d offspring (%d); not including in phasing/imputation."
filtered_sibfams <- Map(function(x, n) {
if (is.null(x)) return(NULL)
if (nrow(x) < min_child) {
warning(sprintf(msg, x$focal_parent[1], n, min_child, nrow(x)))
return(NULL)
}
return(x)
}, sibfamilies, names(sibfamilies))
return(filtered_sibfams)
}
#' Phase Sibling Family by Haplotype Imputation
#'
#' @param x a ProgenyArray object
#' @param sibfam a dataframe from \code{sibFamily()}
#' @param chrom which chromosome to use
#' @param tiles a PhasingTiles object
#' @param ehet heterozygous error rate
#' @param ehom homozygous error rate
#' @param na_thresh missingness threshold to remove individual from clustering
phaseSibFamily <- function(x, sibfam, chrom, tiles,
ehet=0.8, ehom=0.1,
na_thresh=0.8) {
if (!(chrom %in% seqlevels(x@ranges)))
stop(sprintf("chromosome '%s' not in loci", as.character(chrom)))
# get genotype data for this chromosome
this_chrom <- as.logical(seqnames(x@ranges) == chrom)
pgeno <- progenyGenotypes(x, seqname=chrom)
fgeno <- parentGenotypes(x, seqname=chrom)
stopifnot(nrow(pgeno) == nrow(fgeno))
freqs <- freqs(x)[this_chrom]
# impute each window using the indices in a tile
out <- lapply(tiles@tiles[[chrom]], function(loci) {
#message(sprintf("on loci number %d", min(loci)))
res <- emHaplotypeImpute(pgeno[loci, sibfam$progeny, drop=FALSE],
fgeno[loci, , drop=FALSE],
sibfam$other_parent,
freqs[loci], ehet, ehom, na_thresh=na_thresh)
stopifnot(ncol(res$cluster) == length(sibfam$other_parent))
# Store diagnostic information
res$nloci <- length(loci)
res$progeny_na <- sum(is.na(pgeno[loci, sibfam$progeny]))/length(pgeno[loci, sibfam$progeny])
res$progeny_na_per_loci <- rowMeans(is.na(pgeno[loci, sibfam$progeny]))
res$father_na <- sum(is.na(fgeno[loci, ]))/length(fgeno[loci, ])
res$completeness <- sum(apply(!is.na(res$haplos), 1, all))/nrow(res$haplos)
return(res)
})
out
}
phasingMetadata <- function(tiles, ehet, ehom, na_thresh) {
list(tile_type=tiles@type, tile_width=tiles@width,
ehet=ehet, ehom=ehom, na_thresh=na_thresh)
}
#' Phase all Parents in a ProgenyArray object
#'
#' @param x a ProgenyArray object
#' @param tiles a PhasingTiles object
#' @param ehet heterozygous error rate
#' @param ehom homozygous error rate
#' @param na_thresh how much missingness to tolerate before pruning individual
#' @param min_child minimum number of children to include a parent for phasing
#' @param verbose report status messages
#'
#' @export
setMethod("phaseParents", c(x="ProgenyArray"),
function(x, tiles, ehet=0.8, ehom=0.1, na_thresh=0.8, min_child=8, verbose=TRUE) {
ncores <- getOption("mc.cores")
lpfun <- if (!is.null(ncores) && ncores > 1) mcMap else Map
x@tiles <- tiles # add tiles to ProgenyArray Object
# first, note that some mothers may be inconsistent; that is
# the inferred parent may not be a mother included. We use NA for
# these.
# create sibling families, for all parents; filter out parents
# that don't have sufficient children for phasing/imputation
sibfams <- filterSibFamilies(allSibFamilies(x), min_child)
x@sibfams <- sibfams
phaseFun <- function(sibfam, parname, i, n) {
if (is.null(sibfam)) {
warning(sprintf("phaseParents(): skipping parent '%s'; no sib family data", parname))
return(NULL) # nothing to phase, either no progeny or too few
}
chroms <- names(x@tiles@tiles)# uses tile chromosome not slot @ranges!
setNames(lpfun(function(chr) {
msg <- sprintf("phasing parent '%s' (%d of %d), chrom %s", parname, i, n, chr)
if (verbose) message(msg)
phaseSibFamily(x, sibfam, chr, tiles, ehet=ehet,
ehom=ehom, na_thresh=na_thresh)
}, chroms), chroms)
}
x@phased_parents <- lpfun(phaseFun, sibfams, names(sibfams),
seq_along(sibfams), length(sibfams))
names(x@phased_parents) <- colnames(parentGenotypes(x))
x@phased_parents_metadata <- phasingMetadata(tiles, ehet, ehom, na_thresh)
return(x)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.