R/NewPhaseParent.R

Defines functions phase_parent output_phase phase_error_rate phase_chunk infer_dip sum_max_log_1hap setup_mom_haps maxlog_hap_ockid maxlog_hap_selfed_kid jump_win join_chunks link_dad_haps setup_haps

Documented in infer_dip join_chunks jump_win link_dad_haps maxlog_hap_ockid maxlog_hap_selfed_kid output_phase phase_chunk phase_error_rate phase_parent setup_mom_haps sum_max_log_1hap

#' \code{Phasing haplotypes of focal parent}
#'
#' Phasing focal parent's haplotypes using parents' genotypic data and progeny array's genotypic data.
#' Parents' genotypic data can be either confident genotyping data, i.e., WGS data, or 
#' imputed genotypic data generated by \code{impute_parent}. This function was composed of two major parts: 
#' \code{phase_chunk} and \code{join_chunks}. The haplotype was phased using a sliding window and extended to 
#' a large chunk of sequence. The chunks were then joined into chromosomal wide haplotypes.
#'
#' @param GBS.array A GBS.array object generated from \code{impute_parent} or \code{sim.array} functions. 
#' The slot \code{gbs_parents} replaced with imputed parental genotypes.
#' @param win_length Window length used for halotype phasing. Default=10. 
#' win_length > 20 will dramatically increase computational burden. 
#' @param join_length The length of each neighboring chunks used to connect them into a longer one.
#' @param verbose Writing verbose messages. Default=TRUE.
#' 
#' @return return a data.frame with all the log likelihoods. Using function \code{momgeno} to extract mom's genotype. 
#'   
#'   See \url{https://github.com/yangjl/imputeR/blob/master/vignettes/imputeR-vignette.pdf} for more details.
#'   
#' @examples
#' 
#' GBS.array <- sim.array(size.array=50, numloci=1000, imiss=0.2, selfing=0.5)
#' 
#' 
phase_parent <- function(GBS.array, win_length=10, join_length=10, verbose=TRUE, OR=log(3)){

    #### phasing chunks
    ped <- GBS.array@pedigree
    hetsites <- which(GBS.array@gbs_parents[[ped$p1[1]]]==1)
    if(verbose){ message(sprintf("###>>> start to phase hap chunks with [ %s ] hetsites ...",
                                 length(hetsites))) }
    haps <- setup_haps(win_length) 
    chunklist <- phase_chunk(GBS.array, win_length, haps, verbose)
    
    #### joining chunks
    if(join_length > 0 & join_length <= win_length){
        if(verbose){ message(sprintf("###>>> start to join hap chunks ...")) } 
        if(length(chunklist) > 1){
            out <- join_chunks(GBS.array, chunklist, verbose, join_length, OR)
            if(verbose){ message(sprintf("###>>> Reduced chunks from [ %s ] to [ %s ]", 
                                         length(chunklist), length(out))) } 
            chunklist <- out
        }else{
            if(verbose){ message(sprintf("###>>> Only one chunk in total!")) } 
        }
    }
    #### return data.frame
    out <- output_phase(chunklist, info=GBS.array@snpinfo)
    if(verbose){ 
        
        message(sprintf("###>>> phased [ %s percentage (%s/%s) ] of the heterozygote sites", 
           round(nrow(out)/length(hetsites),3)*100, nrow(out), length(hetsites) )) } 
    return(out)
}

#' @rdname phase_parent
output_phase <- function(outchunks, info){
    momdf <- data.frame()
    for(i in 1:length(outchunks)){
        tem <- data.frame(chunk=i, idx=outchunks[[i]][[3]], snpid=info$snpid[outchunks[[i]][[3]]],
                          hap1=outchunks[[i]][[1]], hap2=outchunks[[i]][[2]])
        momdf <- rbind(momdf, tem)
    }
    return(momdf)
}

#' @rdname phase_parent
phase_error_rate <- function(GBS.array, phase){
    ped <- GBS.array@pedigree
    if(length(unique(ped$p1)) != 1){
        stop("### more than one focal parent!!!")
    }
    pidx <- unique(ped$p1)
    
    true_p <- GBS.array@true_parents[[pidx]]
    p <- subset(true_p, hap1 != hap2)
    out <- merge(phase, p, by.x="idx", by.y="row.names", all=TRUE)
    #idx <- which.max(c(cor(out$hap1.x, out$hap1.y), cor(out$hap1.x, out$hap2.y)))
    #er <- sum(out$hap1.x != out[,4+idx])/nrow(out)
    #message(sprintf("###>>> phasing error rate [ %s ] for [ %s ] heterozygote sites.", 
    #                round(er,3), nrow(out)))
    out[is.na(out)] <- -9
    return(out)
}

#' 
#' \code{Phasing haplotype chunks. } 
#'
#' Phasing Parent's haplotypes using sliding window. 
#' These sliding windows were merged into a chunk if there is no conflict. Otherwise, they will be
#' assigned into different chunks.
#' 
#' @param GBS.array A GBS.array object generated from \code{impute_parent} or\code{sim.array} functions.
#' @param win_length A list of vectors of Kid's GBS data. Coded with 0, 1 and 2, which is the copy of alternative alleles. 
#' Missing data should be coded with 3.
#' @param haps all possible haplotypes (2^(win_length-1)) for a given window length.
#' @param verbose Writing verbose messages. Default=TRUE.
#' 
#' @return return A list of haplotype chunks.
#' 
#' @examples
#' GBS.array <- sim.array(size.array=50, numloci=1000)
#' win_length = 10
#' haps <- setup_haps(win_length)
#' probs <- error_mx(hom.error=0.02, het.error=0.8, imiss=0.2) 
#' chunklist <- phase_chunk(GBS.array, win_length, haps, verbose=TRUE)
#' 
phase_chunk <- function(GBS.array, win_length, haps, verbose, OR){
    
    ped <- GBS.array@pedigree
    fidx <- unique(ped$p1)
    if(length(fidx) == 1){
        hetsites <- which(GBS.array@gbs_parents[[fidx]]==1)
    }else{
        stop("!!! error! more than one focal parent in pedigree !!!")
    }
    
    # gets all possible haplotypes for X hets 
    haplist <- list()
    wds <- round(length(hetsites)/win_length, 0)
    pb <- txtProgressBar(min = 1, max = wds, style = 3)
    #for(wdi in 1:(wds-1)){
    #    winidx <- hetsites[(win_length*(wdi-1)+1):(win_length*wdi)]
    #    win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
    #    haplist[[wdi]] <- list(win_hap, 1-win_hap, winidx)
    #    if(verbose){ setTxtProgressBar(pb, wdi) } 
    #}
    ### try to speed up a little bit
    haplist <- lapply(1:(wds-1), function(wdi){
        winidx <- hetsites[(win_length*(wdi-1)+1):(win_length*wdi)]
        win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
        if(verbose){ setTxtProgressBar(pb, wdi) }
        return(list(win_hap, 1-win_hap, winidx))
    })
    
    ### phase last window
    winidx <- hetsites[(win_length*(wds-1)+1):length(hetsites)]
    if(length(winidx) >= (win_length + 5)){
        
        winidx <- hetsites[(win_length*(wds-1)+1):(win_length*wds)]
        haps <- setup_haps(win_length)
        win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
        haplist[[wds]] <- list(win_hap, 1-win_hap, winidx)
        ### last bit
        winidx <- hetsites[(win_length*wds+1):length(hetsites)]
        haps <- setup_haps(win_length=length(winidx))
        win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
        haplist[[wds+1]] <- list(win_hap, 1-win_hap, winidx)
    }else{
        haps <- setup_haps(win_length=length(winidx)) 
        win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=TRUE)
        haplist[[wds]] <- list(win_hap, 1-win_hap, winidx)
    }
    if(verbose){ setTxtProgressBar(pb, wds) }
    close(pb)
    ## list: hap1, hap2 and idx; info
    return(haplist)
}

#' @rdname phase_chunk
infer_dip <- function(GBS.array, winidx, haps, returnhap=FALSE){  
    # momwin is list of heterozygous sites, progeny list of kids genotypes, 
    # haps list of possible haps,momphase1 is current phased mom for use in splitting ties
    #### function for running one hap ####
    phase_probs <- unlist(lapply(1:length(haps), function(a) {
        sum_max_log_1hap(GBS.array, winidx, dad_hap=haps[[a]])
        #print(a)
        }))
    #if multiple haps tie, check each against current phase and return one with smallest distance
    if(length(which(phase_probs==max(phase_probs)))>1){
        if(returnhap){
            return(haps[[sample(which(phase_probs==max(phase_probs)), 1)]])
        } else{
            return(NULL)
        }
    }else{
        return(haps[[which.max(phase_probs)]])
    } 
}

# log likelyhood of one focal parent's hap x all other haps for all kids
#' @rdname phase_chunk
sum_max_log_1hap <- function(GBS.array, winidx, dad_hap=haps[[a]]){
    
    ped <- GBS.array@pedigree
    pgeno <- GBS.array@gbs_parents
    kgeno <- GBS.array@gbs_kids
    
    ### for outcrossed kids: 
    ped1 <- subset(ped, p1 != p2)
    if(nrow(ped1) >0){
        maxlog1 <- lapply(1:nrow(ped1), function(x) {
            mymom <- pgeno[[ped1$p2[x]]]
            if(!is.null(nrow(mymom))){ #phased mom
                temmom <- mymom[winidx, ]
                mom_geno <- temmom$hap1 + temmom$hap2
                missidx <- which(mom_geno > 2)
                if(length(missidx) >0){
                    mom_geno <- mom_geno[-missidx]
                    if(length(mom_geno) == 0){
                        maxlog1 = 0
                    }else{
                        mom_haps <- setup_mom_haps(temmom[-missidx, c("hap1", "hap2", "chunk")])
                        maxlog_hap_ockid(dad_hap[-missidx], mom_geno, mom_haps, kid_geno=kgeno[[ped1$kid[x]]][winidx[-missidx]])
                    }
                }else{
                    mom_haps <- setup_mom_haps(temmom[, c("hap1", "hap2", "chunk")])
                    maxlog_hap_ockid(dad_hap, mom_geno, mom_haps, kid_geno=kgeno[[ped1$kid[x]]][winidx])
                }
            }else{ #unphased mom
                mom_geno <- mymom[winidx]
                het_idx <- which(mom_geno==1)
                if(length(het_idx) > 0){
                    haps1 <- setup_haps(win_length=length(het_idx))
                    haps2 <- lapply(1:length(haps1), function(x) 1-haps1[[x]])
                    allhaps <- c(haps1, haps2)
                    mom_haps <- vector("list", 2^length(het_idx))
                    mom_haps <- lapply(1:length(mom_haps), function(x) {
                        temhap <- mom_geno/2
                        temhap[het_idx] <- allhaps[[x]]
                        return(temhap)})
                }else{
                    mom_haps <- list(mom_geno/2)
                }
                maxlog_hap_ockid(dad_hap, mom_geno, mom_haps, kid_geno=kgeno[[ped1$kid[x]]][winidx])
            }
        })
    }else{
        maxlog1 = 0
    }
    ### for selfed kids
    ped2 <- subset(ped, p1 == p2)
    if(nrow(ped2) > 0){
        maxlog2 <- lapply(1:nrow(ped2), function(x) {
            maxlog_hap_selfed_kid(haplotype=dad_hap, kid_geno=kgeno[[ped2$kid[x]]][winidx])
        }) 
    }else{
        maxlog2=0
    }
    return(sum(unlist(maxlog1)) + sum(unlist(maxlog2)))
} 

#' @rdname phase_chunk
#' @param temmom Must be a data.frame(hap1, hap2, chunk, idx)
setup_mom_haps <- function(temmom){
    haps1 <- setup_haps(length(unique(temmom$chunk)))
    haps2 <- lapply(1:length(haps1), function(x) 1-haps1[[x]])
    allhaps <- c(haps1, haps2)
    mom_haps <- lapply(1:length(allhaps), function(x){
        temout <- c()
        k = 1
        for(c in unique(temmom$chunk)){
            temout <- c(temout, temmom[temmom$chunk==c, allhaps[[x]][k]+1])
            k <- k+1
        }
        return(temout)    
    })
    return(mom_haps)
}

#' @rdname phase_chunk
maxlog_hap_ockid <- function(dad_hap, mom_geno, mom_haps, kid_geno){
    # Find most likely phase of the dad for a kid at a window, return that probability
    # give this dad haplotype, mom genotype (thus all possible haplotypes) 
    # and a kid's diploid genotype over the window and returns maximum prob
    # Mendel is taken care of in the probs[[]] matrix already. 
    ### all possible genotypes of a kid
    allgeno1 <- lapply(1:length(mom_haps), function(x) dad_hap + mom_haps[[x]])
    allgeno2 <- lapply(1:length(mom_haps), function(x) (1-dad_hap) + mom_haps[[x]])
    allgenos <- c(allgeno1, allgeno2)
    
    ### return the possibility of the max genotype
    geno_probs <- lapply(1:length(allgenos), function(geno){
        sum( unlist(lapply(1:length(dad_hap), function(zz) {
            #log(probs[[2]] is the log prob. of kid's obs geno 
            #given the current phased geno and given dad is het. (which is why probs[[2]])
            tem <- probs[[2]][[ mom_geno[zz]+1 ]][allgenos[[geno]][zz]+1, kid_geno[zz]+1]
            return(log(tem))
        })))   
    })
    return(max(unlist(geno_probs)))
}

#' @rdname phase_chunk
maxlog_hap_selfed_kid <- function(haplotype, kid_geno){
    three_genotypes=list()
    haplotype=unlist(haplotype)
    three_genotypes[[1]]=haplotype+haplotype
    three_genotypes[[2]]=haplotype+(1-haplotype)
    three_genotypes[[3]]=(1-haplotype)+(1-haplotype)
    geno_probs=as.numeric() #prob of each of three genotypes
    for(geno in 1:3){
        #log(probs[[2]][three_genotypes,kidwin] is the log prob. of kid's obs geno 
        #given the current phased geno and given mom is het. (which is why probs[[2]])
        geno_probs[geno]=sum( sapply(1:length(haplotype), function(zz) 
            log( probs[[2]][[2]][three_genotypes[[geno]][zz]+1, kid_geno[zz]+1])))
    }
    ### may introduce error
    #if(length(which(geno_probs==max(geno_probs)))!=1){recover()}
    return(max(geno_probs))
}

#' @rdname phase_chunk
jump_win <- function(GBS.array, winstart, win_length, hetsites, haps){
    ### jump to next window
    if(length(hetsites) > (winstart + win_length - 1)){
        winidx <- hetsites[winstart:(winstart + win_length - 1)]
        win_hap <- infer_dip(GBS.array, winidx, haps, returnhap=FALSE)
    }else{
        winidx <- hetsites[winstart:length(hetsites)]
        mom_haps_tem <- setup_haps(win_length=length(winstart:length(hetsites)))
        win_hap <- infer_dip(GBS.array, winidx, haps=mom_haps_tem, returnhap=TRUE)    
    }
    return(win_hap)
}

#'
#' \code{Joining phased haplotype chunks. } 
#'
#' Extending/joining the phased haplotype chunks generated from \code{phase_chunk}.
#'
#' @param GBS.array A GBS.array object generated from \code{impute_parent} or\code{sim.array} functions.
#' @param chunklist A list of haplotype chunks generated from \code{phase_chunk}.
#' @param join_length The length of each neighboring chunks used to connect them into a longer one.
#' @param verbose Writing verbose messages. Default=TRUE.
#' 
#' @return return A list of haplotype chunks.
#' 
#' @examples
#' GBS.array <- sim.array(size.array=50, numloci=1000)
#' win_length = 10
#' haps <- setup_haps(win_length)
#' probs <- error_mx(hom.error=0.02, het.error=0.8, imiss=0.2) 
#' chunklist <- phase_chunk(GBS.array, win_length, haps, verbose=TRUE)
#' res <- join_chunks(GBS.array, chunklist, join_length, verbose=TRUE)
#' 
join_chunks <- function(GBS.array, chunklist, join_length, verbose, OR){
    outhaplist <- list(list())
    outhaplist[[1]] <- chunklist[[1]] ### store the extended haps: hap1, hap2 and idx
    i <- 1
    for(chunki in 2:length(chunklist)){
        if(verbose){ message(sprintf("###>>> join chunks [ %s and %s, total:%s] ...", 
                                     chunki-1, chunki, length(chunklist))) }
        # join two neighbor haplotype chunks
        oldchunk <- chunklist[[chunki-1]]
        newchunk <- chunklist[[chunki]]
        hapidx <- c(oldchunk[3], newchunk[3])
        #dad_haps <- list(c(oldchunk[[1]], newchunk[[1]]), c(oldchunk[[1]], newchunk[[2]]))
        dad_haps_lofl <- list(list(oldchunk[[1]], newchunk[[1]]), 
                              list(oldchunk[[1]], newchunk[[2]]))
        
        ## link previous and current chunks
        temhap <- link_dad_haps(GBS.array, dad_haps_lofl, hapidx, join_length, OR)
        temhap <- c(temhap[[1]], temhap[[2]])
        if(!is.null(temhap)){
            outold <- outhaplist[[i]][[1]]
            outoldchunk <- outold[(length(outold)-length(oldchunk[[1]])+1):length(outold)]
            outnewchunk <- temhap[(length(oldchunk[[1]])+1):length(temhap)]
            
            same <- sum(outoldchunk == temhap[1:length(oldchunk[[1]])])
            #same <- sum(mom_phase1[(length(mom_phase1)-8):length(mom_phase1)] == win_hap[1:length(win_hap)-1])
            outhaplist[[i]][[3]] <- c(outhaplist[[i]][[3]], newchunk[[3]])
            if(same == 0){ #totally opposite phase of last window
                #hap2[length(mom_phase2)+1] <- win_hap[length(win_hap)] 
                outhaplist[[i]][[1]] <- c(outhaplist[[i]][[1]], 1-outnewchunk)
                outhaplist[[i]][[2]] <- c(outhaplist[[i]][[2]], outnewchunk)
                
            } else if(same== length(oldchunk[[1]]) ){ #same phase as last window
                #mom_phase1[length(mom_phase1)+1] <- win_hap[length(win_hap)]
                outhaplist[[i]][[1]] <- c(outhaplist[[i]][[1]], outnewchunk)
                outhaplist[[i]][[2]] <- c(outhaplist[[i]][[2]], 1-outnewchunk)
            } else{
                stop(">>> Extending error !!!")
            }
        } else {
            i <- i +1
            outhaplist[[i]] <- chunklist[[chunki]]
        } 
    }
    return(outhaplist)
}

#' @rdname join_chunks
#' 
link_dad_haps <- function(GBS.array, dad_haps_lofl, hapidx, join_length, OR){
    ### hapidx: a list of idx [[1]] chunk0; [[2]]chunk1
    
    ### limit the join len to reduce computational burden
    if(length(hapidx[[1]]) > join_length ){
        upidx <- (length(hapidx[[1]])-join_length+1):length(hapidx[[1]])
    }else{
        upidx <- 1:length(hapidx[[1]])
    }
    if(length(hapidx[[2]]) > join_length ){
        downidx <- 1:join_length
    }else{
        downidx <- 1:length(hapidx[[2]])
    }
    
    ### get phase probs for each dad_haps
    phase_probs <- lapply(1:length(dad_haps_lofl), function(a) {
        dad_hap <- c(dad_haps_lofl[[a]][[1]][upidx], dad_haps_lofl[[a]][[2]][downidx])
        winidx <- c(hapidx[[1]][upidx], hapidx[[2]][downidx])   
        sum_max_log_1hap(GBS.array, winidx, dad_hap) 
    } )
    phase_probs <- unlist(phase_probs)
    #if multiple haps tie, return two un-phased haps
     
    if(length(which(phase_probs==max(phase_probs)))>1){
        return(NULL)
    }else{
        v <- phase_probs
        n <- length(v)
        t <- max(v) - sort(v, partial=n-1)[n-1]
        if(t < OR){
            return(NULL)
        }else{
            return(dad_haps_lofl[[which.max(phase_probs)]]) 
        }
    } 
}
############################################################################
# Setup all possible haplotypes for window of X heterozgous sites
# This needs to be fixed to remove redundancy. E.g. 010 is the same as 101 and 1010 is same as 0101. 
# I don't think should bias things in the meantime, just be slow.
setup_haps <- function(win_length){
    if(win_length <= 20){
        alist <- lapply(1:win_length, function(a) c(0,1) )
        ### give a combination of all 0,1 into a data.frame
        hapdf <- expand.grid(alist)[1:2^(win_length-1),]
        ### split the data.frame into a list
        return(as.list(as.data.frame(t(hapdf)))) 
    }else{
        stop("!!! Can not handle [win_length > 20] !")
    }   
}
yangjl/imputeR documentation built on May 4, 2019, 2:28 p.m.