#' \code{Phasing haplotypes of focal parent}
#'
#' Phasing focal parent's haplotypes using parents' genotypic data and progeny array's genotypic data.
#' Parents' genotypic data can be either confident genotyping data, i.e., WGS data, or
#' imputed genotypic data generated by \code{impute_parent}. This function was composed of two major parts:
#' \code{phase_chunk} and \code{join_chunks}. The haplotype was phased using a sliding window and extended to
#' a large chunk of sequence. The chunks were then joined into chromosomal wide haplotypes.
#'
#' @param GBS.array A GBS.array object generated from \code{impute_parent} or \code{sim.array} functions.
#' The slot \code{gbs_parents} replaced with imputed parental genotypes.
#' @param win_length Window length used for halotype phasing. Default=10.
#' win_length > 20 will dramatically increase computational burden.
#' @param join_length The length of each neighboring chunks used to connect them into a longer one.
#' @param verbose Writing verbose messages. Default=TRUE.
#'
#' @return return a data.frame with all the log likelihoods. Using function \code{momgeno} to extract mom's genotype.
#'
#' See \url{https://github.com/yangjl/imputeR/blob/master/vignettes/imputeR-vignette.pdf} for more details.
#'
#' @examples
#'
#' GBS.array <- sim.array(size.array=50, numloci=1000, imiss=0.2, selfing=0.5)
#'
#'
phase_parent <- function(GBS.array, win_length=10, join_length=10, verbose=TRUE, OR=log(3)){
#### phasing chunks
ped <- GBS.array@pedigree
hetsites <- which(GBS.array@gbs_parents[[ped$p1[1]]]==1)
if(verbose){ message(sprintf("###>>> start to phase hap chunks with [ %s ] hetsites ...",
length(hetsites))) }
haps <- setup_haps(win_length)
chunklist <- phase_chunk(GBS.array, win_length, haps, verbose)
#### joining chunks
if(join_length > 0 & join_length <= win_length){
if(verbose){ message(sprintf("###>>> start to join hap chunks ...")) }
if(length(chunklist) > 1){
out <- join_chunks(GBS.array, chunklist, verbose, join_length, OR)
if(verbose){ message(sprintf("###>>> Reduced chunks from [ %s ] to [ %s ]",
length(chunklist), length(out))) }
chunklist <- out
}else{
if(verbose){ message(sprintf("###>>> Only one chunk in total!")) }
}
}
#### return data.frame
out <- output_phase(chunklist, info=GBS.array@snpinfo)
if(verbose){
message(sprintf("###>>> phased [ %s percentage (%s/%s) ] of the heterozygote sites",
round(nrow(out)/length(hetsites),3)*100, nrow(out), length(hetsites) )) }
return(out)
}
#' @rdname phase_parent
output_phase <- function(outchunks, info){
momdf <- data.frame()
for(i in 1:length(outchunks)){
tem <- data.frame(chunk=i, idx=outchunks[[i]][[3]], snpid=info$snpid[outchunks[[i]][[3]]],
hap1=outchunks[[i]][[1]], hap2=outchunks[[i]][[2]])
momdf <- rbind(momdf, tem)
}
return(momdf)
}
#' @rdname phase_parent
phase_error_rate <- function(GBS.array, phase){
ped <- GBS.array@pedigree
if(length(unique(ped$p1)) != 1){
stop("### more than one focal parent!!!")
}
pidx <- unique(ped$p1)
true_p <- GBS.array@true_parents[[pidx]]
p <- subset(true_p, hap1 != hap2)
out <- merge(phase, p, by.x="idx", by.y="row.names", all=TRUE)
#idx <- which.max(c(cor(out$hap1.x, out$hap1.y), cor(out$hap1.x, out$hap2.y)))
#er <- sum(out$hap1.x != out[,4+idx])/nrow(out)
#message(sprintf("###>>> phasing error rate [ %s ] for [ %s ] heterozygote sites.",
# round(er,3), nrow(out)))
out[is.na(out)] <- -9
return(out)
}
#'
#' \code{Phasing haplotype chunks. }
#'
#' Phasing Parent's haplotypes using sliding window.
#' These sliding windows were merged into a chunk if there is no conflict. Otherwise, they will be
#' assigned into different chunks.
#'
#' @param GBS.array A GBS.array object generated from \code{impute_parent} or\code{sim.array} functions.
#' @param win_length A list of vectors of Kid's GBS data. Coded with 0, 1 and 2, which is the copy of alternative alleles.
#' Missing data should be coded with 3.
#' @param haps all possible haplotypes (2^(win_length-1)) for a given window length.
#' @param verbose Writing verbose messages. Default=TRUE.
#'
#' @return return A list of haplotype chunks.
#'
#' @examples
#' GBS.array <- sim.array(size.array=50, numloci=1000)
#' win_length = 10
#' haps <- setup_haps(win_length)
#' probs <- error_mx(hom.error=0.02, het.error=0.8, imiss=0.2)
#' chunklist <- phase_chunk(GBS.array, win_length, haps, verbose=TRUE)
#'
phase_chunk <- function(GBS.array, win_length, haps, verbose, OR){
ped <- GBS.array@pedigree
fidx <- unique(ped$p1)
if(length(fidx) == 1){
hetsites <- which(GBS.array@gbs_parents[[fidx]]==1)
}else{
stop("!!! error! more than one focal parent in pedigree !!!")
}
# gets all possible haplotypes for X hets
haplist <- list()
wds <- round(length(hetsites)/win_length, 0)
pb <- txtProgressBar(min = 1, max = wds, style = 3)
#for(wdi in 1:(wds-1)){
# winidx <- hetsites[(win_length*(wdi-1)+1):(win_length*wdi)]
# win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
# haplist[[wdi]] <- list(win_hap, 1-win_hap, winidx)
# if(verbose){ setTxtProgressBar(pb, wdi) }
#}
### try to speed up a little bit
haplist <- lapply(1:(wds-1), function(wdi){
winidx <- hetsites[(win_length*(wdi-1)+1):(win_length*wdi)]
win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
if(verbose){ setTxtProgressBar(pb, wdi) }
return(list(win_hap, 1-win_hap, winidx))
})
### phase last window
winidx <- hetsites[(win_length*(wds-1)+1):length(hetsites)]
if(length(winidx) >= (win_length + 5)){
winidx <- hetsites[(win_length*(wds-1)+1):(win_length*wds)]
haps <- setup_haps(win_length)
win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
haplist[[wds]] <- list(win_hap, 1-win_hap, winidx)
### last bit
winidx <- hetsites[(win_length*wds+1):length(hetsites)]
haps <- setup_haps(win_length=length(winidx))
win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
haplist[[wds+1]] <- list(win_hap, 1-win_hap, winidx)
}else{
haps <- setup_haps(win_length=length(winidx))
win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
haplist[[wds]] <- list(win_hap, 1-win_hap, winidx)
}
if(verbose){ setTxtProgressBar(pb, wds) }
close(pb)
## list: hap1, hap2 and idx; info
return(haplist)
}
#' @rdname phase_chunk
infer_dip <- function(GBS.array, winidx, haps, returnhap=FALSE){
# momwin is list of heterozygous sites, progeny list of kids genotypes,
# haps list of possible haps,momphase1 is current phased mom for use in splitting ties
#### function for running one hap ####
phase_probs <- unlist(lapply(1:length(haps), function(a) {
sum_max_log_1hap(GBS.array, winidx, dad_hap=haps[[a]])
#print(a)
}))
#if multiple haps tie, check each against current phase and return one with smallest distance
if(length(which(phase_probs==max(phase_probs)))>1){
if(returnhap){
return(haps[[sample(which(phase_probs==max(phase_probs)), 1)]])
} else{
return(NULL)
}
}else{
return(haps[[which.max(phase_probs)]])
}
}
# log likelyhood of one focal parent's hap x all other haps for all kids
#' @rdname phase_chunk
sum_max_log_1hap <- function(GBS.array, winidx, dad_hap=haps[[a]]){
ped <- GBS.array@pedigree
pgeno <- GBS.array@gbs_parents
kgeno <- GBS.array@gbs_kids
### for outcrossed kids:
ped1 <- subset(ped, p1 != p2)
if(nrow(ped1) >0){
maxlog1 <- lapply(1:nrow(ped1), function(x) {
mymom <- pgeno[[ped1$p2[x]]]
if(!is.null(nrow(mymom))){ #phased mom
temmom <- mymom[winidx, ]
mom_geno <- temmom$hap1 + temmom$hap2
missidx <- which(mom_geno > 2)
if(length(missidx) >0){
mom_geno <- mom_geno[-missidx]
if(length(mom_geno) == 0){
maxlog1 = 0
}else{
mom_haps <- setup_mom_haps(temmom[-missidx, c("hap1", "hap2", "chunk")])
maxlog_hap_ockid(dad_hap[-missidx], mom_geno, mom_haps, kid_geno=kgeno[[ped1$kid[x]]][winidx[-missidx]])
}
}else{
mom_haps <- setup_mom_haps(temmom[, c("hap1", "hap2", "chunk")])
maxlog_hap_ockid(dad_hap, mom_geno, mom_haps, kid_geno=kgeno[[ped1$kid[x]]][winidx])
}
}else{ #unphased mom
mom_geno <- mymom[winidx]
het_idx <- which(mom_geno==1)
if(length(het_idx) > 0){
haps1 <- setup_haps(win_length=length(het_idx))
haps2 <- lapply(1:length(haps1), function(x) 1-haps1[[x]])
allhaps <- c(haps1, haps2)
mom_haps <- vector("list", 2^length(het_idx))
mom_haps <- lapply(1:length(mom_haps), function(x) {
temhap <- mom_geno/2
temhap[het_idx] <- allhaps[[x]]
return(temhap)})
}else{
mom_haps <- list(mom_geno/2)
}
maxlog_hap_ockid(dad_hap, mom_geno, mom_haps, kid_geno=kgeno[[ped1$kid[x]]][winidx])
}
})
}else{
maxlog1 = 0
}
### for selfed kids
ped2 <- subset(ped, p1 == p2)
if(nrow(ped2) > 0){
maxlog2 <- lapply(1:nrow(ped2), function(x) {
maxlog_hap_selfed_kid(haplotype=dad_hap, kid_geno=kgeno[[ped2$kid[x]]][winidx])
})
}else{
maxlog2=0
}
return(sum(unlist(maxlog1)) + sum(unlist(maxlog2)))
}
#' @rdname phase_chunk
#' @param temmom Must be a data.frame(hap1, hap2, chunk, idx)
setup_mom_haps <- function(temmom){
haps1 <- setup_haps(length(unique(temmom$chunk)))
haps2 <- lapply(1:length(haps1), function(x) 1-haps1[[x]])
allhaps <- c(haps1, haps2)
mom_haps <- lapply(1:length(allhaps), function(x){
temout <- c()
k = 1
for(c in unique(temmom$chunk)){
temout <- c(temout, temmom[temmom$chunk==c, allhaps[[x]][k]+1])
k <- k+1
}
return(temout)
})
return(mom_haps)
}
#' @rdname phase_chunk
maxlog_hap_ockid <- function(dad_hap, mom_geno, mom_haps, kid_geno){
# Find most likely phase of the dad for a kid at a window, return that probability
# give this dad haplotype, mom genotype (thus all possible haplotypes)
# and a kid's diploid genotype over the window and returns maximum prob
# Mendel is taken care of in the probs[[]] matrix already.
### all possible genotypes of a kid
allgeno1 <- lapply(1:length(mom_haps), function(x) dad_hap + mom_haps[[x]])
allgeno2 <- lapply(1:length(mom_haps), function(x) (1-dad_hap) + mom_haps[[x]])
allgenos <- c(allgeno1, allgeno2)
### return the possibility of the max genotype
geno_probs <- lapply(1:length(allgenos), function(geno){
sum( unlist(lapply(1:length(dad_hap), function(zz) {
#log(probs[[2]] is the log prob. of kid's obs geno
#given the current phased geno and given dad is het. (which is why probs[[2]])
tem <- probs[[2]][[ mom_geno[zz]+1 ]][allgenos[[geno]][zz]+1, kid_geno[zz]+1]
return(log(tem))
})))
})
return(max(unlist(geno_probs)))
}
#' @rdname phase_chunk
maxlog_hap_selfed_kid <- function(haplotype, kid_geno){
three_genotypes=list()
haplotype=unlist(haplotype)
three_genotypes[[1]]=haplotype+haplotype
three_genotypes[[2]]=haplotype+(1-haplotype)
three_genotypes[[3]]=(1-haplotype)+(1-haplotype)
geno_probs=as.numeric() #prob of each of three genotypes
for(geno in 1:3){
#log(probs[[2]][three_genotypes,kidwin] is the log prob. of kid's obs geno
#given the current phased geno and given mom is het. (which is why probs[[2]])
geno_probs[geno]=sum( sapply(1:length(haplotype), function(zz)
log( probs[[2]][[2]][three_genotypes[[geno]][zz]+1, kid_geno[zz]+1])))
}
### may introduce error
#if(length(which(geno_probs==max(geno_probs)))!=1){recover()}
return(max(geno_probs))
}
#' @rdname phase_chunk
jump_win <- function(GBS.array, winstart, win_length, hetsites, haps){
### jump to next window
if(length(hetsites) > (winstart + win_length - 1)){
winidx <- hetsites[winstart:(winstart + win_length - 1)]
win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=FALSE)
}else{
winidx <- hetsites[winstart:length(hetsites)]
mom_haps_tem <- setup_haps(win_length=length(winstart:length(hetsites)))
win_hap <- infer_dip(GBS.array, winidx, haps=mom_haps_tem, returnhap=TRUE)
}
return(win_hap)
}
#'
#' \code{Joining phased haplotype chunks. }
#'
#' Extending/joining the phased haplotype chunks generated from \code{phase_chunk}.
#'
#' @param GBS.array A GBS.array object generated from \code{impute_parent} or\code{sim.array} functions.
#' @param chunklist A list of haplotype chunks generated from \code{phase_chunk}.
#' @param join_length The length of each neighboring chunks used to connect them into a longer one.
#' @param verbose Writing verbose messages. Default=TRUE.
#'
#' @return return A list of haplotype chunks.
#'
#' @examples
#' GBS.array <- sim.array(size.array=50, numloci=1000)
#' win_length = 10
#' haps <- setup_haps(win_length)
#' probs <- error_mx(hom.error=0.02, het.error=0.8, imiss=0.2)
#' chunklist <- phase_chunk(GBS.array, win_length, haps, verbose=TRUE)
#' res <- join_chunks(GBS.array, chunklist, join_length, verbose=TRUE)
#'
join_chunks <- function(GBS.array, chunklist, join_length, verbose, OR){
outhaplist <- list(list())
outhaplist[[1]] <- chunklist[[1]] ### store the extended haps: hap1, hap2 and idx
i <- 1
for(chunki in 2:length(chunklist)){
if(verbose){ message(sprintf("###>>> join chunks [ %s and %s, total:%s] ...",
chunki-1, chunki, length(chunklist))) }
# join two neighbor haplotype chunks
oldchunk <- chunklist[[chunki-1]]
newchunk <- chunklist[[chunki]]
hapidx <- c(oldchunk[3], newchunk[3])
#dad_haps <- list(c(oldchunk[[1]], newchunk[[1]]), c(oldchunk[[1]], newchunk[[2]]))
dad_haps_lofl <- list(list(oldchunk[[1]], newchunk[[1]]),
list(oldchunk[[1]], newchunk[[2]]))
## link previous and current chunks
temhap <- link_dad_haps(GBS.array, dad_haps_lofl, hapidx, join_length, OR)
temhap <- c(temhap[[1]], temhap[[2]])
if(!is.null(temhap)){
outold <- outhaplist[[i]][[1]]
outoldchunk <- outold[(length(outold)-length(oldchunk[[1]])+1):length(outold)]
outnewchunk <- temhap[(length(oldchunk[[1]])+1):length(temhap)]
same <- sum(outoldchunk == temhap[1:length(oldchunk[[1]])])
#same <- sum(mom_phase1[(length(mom_phase1)-8):length(mom_phase1)] == win_hap[1:length(win_hap)-1])
outhaplist[[i]][[3]] <- c(outhaplist[[i]][[3]], newchunk[[3]])
if(same == 0){ #totally opposite phase of last window
#hap2[length(mom_phase2)+1] <- win_hap[length(win_hap)]
outhaplist[[i]][[1]] <- c(outhaplist[[i]][[1]], 1-outnewchunk)
outhaplist[[i]][[2]] <- c(outhaplist[[i]][[2]], outnewchunk)
} else if(same== length(oldchunk[[1]]) ){ #same phase as last window
#mom_phase1[length(mom_phase1)+1] <- win_hap[length(win_hap)]
outhaplist[[i]][[1]] <- c(outhaplist[[i]][[1]], outnewchunk)
outhaplist[[i]][[2]] <- c(outhaplist[[i]][[2]], 1-outnewchunk)
} else{
stop(">>> Extending error !!!")
}
} else {
i <- i +1
outhaplist[[i]] <- chunklist[[chunki]]
}
}
return(outhaplist)
}
#' @rdname join_chunks
#'
link_dad_haps <- function(GBS.array, dad_haps_lofl, hapidx, join_length, OR){
### hapidx: a list of idx [[1]] chunk0; [[2]]chunk1
### limit the join len to reduce computational burden
if(length(hapidx[[1]]) > join_length ){
upidx <- (length(hapidx[[1]])-join_length+1):length(hapidx[[1]])
}else{
upidx <- 1:length(hapidx[[1]])
}
if(length(hapidx[[2]]) > join_length ){
downidx <- 1:join_length
}else{
downidx <- 1:length(hapidx[[2]])
}
### get phase probs for each dad_haps
phase_probs <- lapply(1:length(dad_haps_lofl), function(a) {
dad_hap <- c(dad_haps_lofl[[a]][[1]][upidx], dad_haps_lofl[[a]][[2]][downidx])
winidx <- c(hapidx[[1]][upidx], hapidx[[2]][downidx])
sum_max_log_1hap(GBS.array, winidx, dad_hap)
} )
phase_probs <- unlist(phase_probs)
#if multiple haps tie, return two un-phased haps
if(length(which(phase_probs==max(phase_probs)))>1){
return(NULL)
}else{
v <- phase_probs
n <- length(v)
t <- max(v) - sort(v, partial=n-1)[n-1]
if(t < OR){
return(NULL)
}else{
return(dad_haps_lofl[[which.max(phase_probs)]])
}
}
}
############################################################################
# Setup all possible haplotypes for window of X heterozgous sites
# This needs to be fixed to remove redundancy. E.g. 010 is the same as 101 and 1010 is same as 0101.
# I don't think should bias things in the meantime, just be slow.
setup_haps <- function(win_length){
if(win_length <= 20){
alist <- lapply(1:win_length, function(a) c(0,1) )
### give a combination of all 0,1 into a data.frame
hapdf <- expand.grid(alist)[1:2^(win_length-1),]
### split the data.frame into a list
return(as.list(as.data.frame(t(hapdf))))
}else{
stop("!!! Can not handle [win_length > 20] !")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.