R/updog_genotype.R

Defines functions plot_error_dist updog_genotype

Documented in plot_error_dist updog_genotype

#' OneMap interface with updog package
#'
#' Uses alelle counts to reestimate genotypes with updog approach and 
#' stores the genotypes probabilities or for further multipoint 
#' analysis
#' 
#' @param vcfR.object object output of the vcfR package
#' @param onemap.object object of class onemap 
#' @param vcf.par Field of VCF that informs the depth of alleles
#' @param f1 f1 individual identification if f2 cross type
#' @param recovering logical defining if markers should be recovered from VCF
#' @param mean_phred the mean phred score of the sequencer technology
#' @param cores number of threads 
#' @param depths list containing a matrix for ref and other for alt allele counts, samples ID in colum and markers ID in rows
#' @param parent1 parent 1 identification in vcfR object
#' @param parent2 parent 2 identification in vcfR objetc
#' @param global_error number from 0 to 1 defining the global error to be considered together 
#' with the genotype errors or the genotype probabilities or NULL to not considered any global error
#' @param use_genotypes_errors if \code{TRUE} the error probability of each genotype will be considered in emission function of HMM
#' @param use_genotype_probs if \code{TRUE} the probability of each possible genotype will be considered in emission function of HMM
#' @param rm_multiallelic if \code{TRUE} multiallelic markers will be removed from the output onemap object 
#' 
#' @return onemap object with genotypes updated 
#' 
#' @author Cristiane Taniguti, \email{chtaniguti@@usp.br} 
#' @seealso \code{\link[onemap]{extract_depth}} 
#'     \code{\link[onemap]{binom_genotype}} and 
#'     \url{https://github.com/dcgerard/updog}.
#'
#' @references 
#'
#' Gerard, D., Ferrão L.F.V., Garcia, A.A.F., & Stephens, M. (2018). Harnessing 
#' Empirical Bayes and Mendelian Segregation for Genotyping Autopolyploids from 
#' Messy Sequencing Data. bioRxiv. doi: 10.1101/281550.
#'
#' @import foreach doParallel updog
#'   
#' @export
updog_genotype <- function(vcfR.object=NULL,
                           onemap.object= NULL,
                           vcf.par = c("AD", "DPR"),
                           parent1="P1",
                           parent2="P2",
                           f1=NULL,
                           recovering = FALSE,
                           mean_phred = 20, 
                           cores = 2,
                           depths = NULL,
                           global_error = NULL,
                           use_genotypes_errors = TRUE,
                           use_genotypes_probs = FALSE,
                           rm_multiallelic = TRUE){
  
  # checks
  if(use_genotypes_errors & use_genotypes_probs){
    stop("You must choose only one of the offered approaches to be considered in emission function of HMM. `use_genotype_errors` or `use_genotype_probs`")
  } else if (!(use_genotypes_errors | use_genotypes_probs)){
    stop("You should choose one approach to be considered in emission function of HMM.`use_genotype_errors` or `use_genotype_probs`")
  }
  
  if(is.null(depths)){
    depth_matrix <- extract_depth(vcfR.object=vcfR.object,
                                  onemap.object=onemap.object,
                                  vcf.par=vcf.par,
                                  parent1=parent1,
                                  parent2=parent2,
                                  f1=f1,
                                  recovering=recovering)
  } else {
    p1 <- which(colnames(depths[[1]]) == parent1)
    p2 <- which(colnames(depths[[1]]) == parent2)
    if(is.null(f1)){
      palt <- depths[[2]][,c(p1,p2)]
      pref <- depths[[1]][,c(p1,p2)]
      oalt <- depths[[2]][,-c(p1,p2)]
      oref <- depths[[1]][,-c(p1,p2)]
      
      oalt <- oalt[,match(rownames(onemap.object$geno),colnames(oalt))]
      oref <- oref[,match(rownames(onemap.object$geno),colnames(oref))]
      
      psize <- palt + pref
      osize <- oalt + oref
      
      rownames(palt) <- rownames(pref) <- rownames(oalt)
      
    } else {
      f1i <- which(colnames(depths[[1]]) == f1)
      palt <- as.numeric(depths[[2]][,f1i])
      pref <- as.numeric(depths[[1]][,f1i])
      oalt <- depths[[2]][,-c(p1,p2,f1i)]
      oref <- depths[[1]][,-c(p1,p2,f1i)]
      
      oalt <- oalt[,match(rownames(onemap.object$geno),colnames(oalt))]
      oref <- oref[,match(rownames(onemap.object$geno),colnames(oref))]
      
      psize <- palt + pref
      osize <- oalt + oref
      
      names(palt) <- names(pref) <- rownames(oalt)
      
    }
    
    if(recovering==FALSE){
      palt <- palt[which(rownames(palt) %in% colnames(onemap.object$geno)),]
      pref <- pref[which(rownames(pref) %in% colnames(onemap.object$geno)),]
      psize <- psize[which(rownames(psize) %in% colnames(onemap.object$geno)),]
      oalt <- oalt[which(rownames(oalt) %in% colnames(onemap.object$geno)),]
      oref <- oref[which(rownames(oref) %in% colnames(onemap.object$geno)),]
      osize <- osize[which(rownames(osize) %in% colnames(onemap.object$geno)),]
    }
    
    depth_matrix <- list("palt"=palt, "pref"=pref, "psize"=psize, 
                         "oalt"=as.matrix(oalt), "oref"=as.matrix(oref), "osize"=as.matrix(osize), 
                         n.mks = dim(oalt)[1], n.ind = dim(oalt)[2],
                         inds = colnames(oalt), mks = rownames(oalt),
                         CHROM = sapply(strsplit(rownames(oalt), split="_"), "[",1),
                         POS = sapply(strsplit(rownames(oalt), split="_"), "[",2),
                         onemap.object = onemap.object)
  }
  
  # Extract list objects                                                                                         
  for(i in 1:length(depth_matrix)){
    tempobj=depth_matrix[[i]]
    eval(parse(text=paste(names(depth_matrix)[[i]],"= tempobj")))
  }
  
  onemap_updog <- onemap.object
  
  # removing the multiallelics from the vcf
  multi <- grepl(",",vcfR.object@fix[,5])
  multi.mks <- vcfR.object@fix[,3][multi]
  
  if(any(multi)){ 
    idx <- which(rownames(palt) %in% multi.mks)
    palt <- palt[-idx,]
    pref <- pref[-idx,]
    psize <- psize[-idx,]
    oalt <- oalt[-idx,]
    oref <- oref[-idx,]
    osize <- osize[-idx,]
    n.mks = dim(oalt)[1]
    mks = rownames(oalt)
    CHROM = CHROM[-idx]
    POS = POS[-idx]
    
    # removing the multiallelics from the onemap object
    if(is(onemap.object, "outcross")){
      idx.multi <- which(!(onemap.object$segr.type %in% c("B3.7", "D1.10", "D2.15")))
      mult.obj <- split_onemap(onemap.object, mks= idx.multi)
      idx.bi <- which(onemap.object$segr.type %in% c("B3.7", "D1.10", "D2.15"))
      onemap.object <- split_onemap(onemap.object, mks= idx.bi)
    } else if(is(onemap.object,"f2")){
      idx.multi <- which(!(onemap.object$segr.type %in% c("A.H.B")))
      mult.obj <- split_onemap(onemap.object, mks= idx.multi)
      idx.bi <- which(onemap.object$segr.type %in% c("A.H.B"))
      onemap.object <- split_onemap(onemap.object, mks= idx.bi)
    }
  }
  
  # Missing data                                                                                                 
  if(is(onemap.object, "f2")){
    rm.mks <- which(psize ==0)
  }else{
    rm.mks <- which(psize[,1] ==0 | psize[,2] ==0)
  }
  
  rm.mks1 <- vector()
  for(i in 1:n.mks){
    if(length(table(osize[i,]))==1)
      rm.mks1 <- c(rm.mks1,i)
  }
  rm.mks <- sort(unique(c(rm.mks, rm.mks1)))
  
  if(length(rm.mks) > 0){
    if(is(onemap.object, "f2")){
      psize <- psize[-rm.mks]
      pref <- pref[-rm.mks]
    } else {
      psize <- psize[-rm.mks,]
      pref <- pref[-rm.mks,]
    }
    osize <- osize[-rm.mks,]
    oref <- oref[-rm.mks,]
    n.mks <- n.mks - length(rm.mks)
    mks <- mks[-rm.mks]
    onemap_updog$CHROM <- CHROM[-rm.mks]
    onemap_updog$POS <- POS[-rm.mks]
  }
  
  idx <- which(osize==0)
  osize[idx] <- NA
  oref[idx] <- NA
  
  geno_matrix <- maxpostprob <- matrix(rep(NA,dim(osize)[2]*dim(osize)[1]),nrow=dim(osize)[1])
  P1 <- P2 <- rep(NA, n.mks)
  if(is(onemap.object, "f2")){
    cl <- parallel::makeCluster(cores)
    doParallel::registerDoParallel(cl = cl)
    gene_est <- foreach(i = 1:n.mks,
                        .combine = 'c',
                        .multicombine=TRUE,
                        .export = "flexdog") %dopar% {
                          fout <- flexdog(refvec  = oref[i,],
                                          sizevec = osize[i,],
                                          ploidy  = 2,
                                          p1ref = pref[i],
                                          p1size = psize[i],
                                          model = "s1")
                          list(fout)
                        }
    parallel::stopCluster(cl)
    
    P1 <- unlist(sapply(sapply(gene_est, "[", 8), "[", 1))
  } else if(is(onemap.object, "outcross")){
    cl <- parallel::makeCluster(cores)
    doParallel::registerDoParallel(cl = cl)
    gene_est <- foreach(i = 1:n.mks) %dopar% {
      ## fit flexdog                                                                                                                             
      fout <- updog::flexdog(refvec  = oref[i,],
                             sizevec = osize[i,],
                             ploidy  = 2,
                             p1ref = pref[i,2],
                             p1size = psize[i,2],
                             p2ref = pref[i,1],
                             p2size = psize[i,1],
                             model = "f1")
      fout
    }
    parallel::stopCluster(cl)
    P2 <- unlist(sapply(sapply(gene_est, "[", 8), "[", 1))
    P1 <- unlist(sapply(sapply(gene_est, "[", 8), "[", 2))
  }
  
  temp1 <- lapply(gene_est, "[[", 9)
  temp2 <- lapply(gene_est, "[[", 10)
  
  for(i in 1:n.mks){
    geno_matrix[i,] <- temp1[[i]]
    maxpostprob[i,] <- temp2[[i]]
  }
  
  postmat <- lapply(gene_est, "[", 6)
  
  genotypes_probs <- postmat[[1]]$postmat
  for(i in 2:length(postmat)) # Order: mk 1 1 1 ind 1 2 3
    genotypes_probs <- rbind(genotypes_probs, postmat[[i]]$postmat)
  
  # sort - order mk 1 2 3 ind 1 1 1 
  idx <- rep(1:dim(osize)[2], dim(osize)[1])
  genotypes_probs <- genotypes_probs[order(idx), ]
  
  if(is(onemap.object, "outcross")){
    rm.mk <- which(P1==0 & P2 == 2 | P2 == 0 & P1 == 0 | P1==2 & P2 == 2 | P1==2 & P2 ==0)
    
    if(length(rm.mk) > 0){
      cat("markers", length(mks[rm.mk]), "were reestimated as non-informative and removed of analysis \n")
      geno_matrix <- geno_matrix[-rm.mk,]
      maxpostprob <- maxpostprob[-rm.mk,]
      P1 <- P1[-rm.mk]
      P2 <- P2[-rm.mk]
      n.mks <- n.mks - length(rm.mk)
      mks <- mks[-rm.mk]
      genotypes_probs <- genotypes_probs[-c(rm.mk + rep(c(0:(dim(osize)[2]-1))*dim(osize)[1], each=length(rm.mk))),]
    }
    conv_geno <- matrix(rep(NA,dim(geno_matrix)[2]*dim(geno_matrix)[1]),nrow=dim(geno_matrix)[1])
    segr.type <- segr.type.num <- rep(NA, n.mks)
    conv_geno[which(geno_matrix==1)] <- 2
    # B3.7                                                                                                         
    idx <- which(P1==1 & P2==1)
    conv_geno[idx,][which(geno_matrix[idx,]==0)] <- 3
    conv_geno[idx,][which(geno_matrix[idx,]==2)] <- 1
    segr.type[idx] <- "B3.7"
    segr.type.num[idx] <- 4
    
    # D2.15                                                                                                        
    idx <- which(P1==0 & P2==1)
    conv_geno[idx,][which(geno_matrix[idx,]==0)] <- 1
    conv_geno[idx,][which(geno_matrix[idx,]==2)] <- 0
    segr.type[idx] <- "D2.15"
    segr.type.num[idx] <- 7
    
    idx <- which(P1==2 & P2==1)
    conv_geno[idx,][which(geno_matrix[idx,]==0)] <- 0
    conv_geno[idx,][which(geno_matrix[idx,]==2)] <- 1
    segr.type[idx] <- "D2.15"
    segr.type.num[idx] <- 7
    # D1.10                                                                                                        
    idx <- which(P1==1 & P2==0)
    conv_geno[idx,][which(geno_matrix[idx,]==0)] <- 1
    conv_geno[idx,][which(geno_matrix[idx,]==2)] <- 0
    segr.type[idx] <- "D1.10"
    segr.type.num[idx] <- 6
    
    idx <- which(P1==1 & P2==2)
    conv_geno[idx,][which(geno_matrix[idx,]==0)] <- 0
    conv_geno[idx,][which(geno_matrix[idx,]==2)] <- 1
    segr.type[idx] <- "D1.10"
    segr.type.num[idx] <- 6
    
  } else {
    rm.mk <- which(P1!=1)
    if(length(rm.mk) > 0){
      cat("markers", length(mks[rm.mk]), "were estimated as non-informative and removed of analysis \n")
      geno_matrix <- geno_matrix[-rm.mk,]
      maxpostprob <- maxpostprob[-rm.mk,]
      P1 <- P1[-rm.mk]
      P2 <- P2[-rm.mk]
      n.mks <- n.mks - length(rm.mk)
      mks <- mks[-rm.mk]
      genotypes_probs <- genotypes_probs[-c(rm.mk + rep(c(0:(dim(osize)[2]-1))*dim(osize)[1], each=length(rm.mk))),]
    }
    
    conv_geno <- matrix(rep(NA,dim(geno_matrix)[2]*dim(geno_matrix)[1]),nrow=dim(geno_matrix)[1])
    segr.type <- segr.type.num <- rep(NA, n.mks)
    conv_geno[which(geno_matrix==1)] <- 2
    
    # A.H.B                                                                                                        
    idx <- which(P1==1)
    conv_geno[idx,][which(geno_matrix[idx,]==0)] <- 3
    conv_geno[idx,][which(geno_matrix[idx,]==2)] <- 1
    segr.type[idx] <- "A.H.B"
    segr.type.num[idx] <- 1
  }
  
  conv_geno <-  t(conv_geno)
  conv_geno[which(is.na(conv_geno))] <- 0
  
  comp <- which(colnames(onemap.object$geno) %in% mks)
  onemap_updog$CHROM <- onemap.object$CHROM[comp]
  onemap_updog$POS <- onemap.object$POS[comp]
  
  comp1 <- which(depth_matrix$mks %in% mks)
  onemap_updog$CHROM <- depth_matrix$CHROM[comp1]
  onemap_updog$POS <- depth_matrix$POS[comp1]
  
  # If some marker in onemap object is now non-informative and didn't recover any marker                         
  if(length(colnames(onemap.object$geno)) - length(comp) > 0 & length(mks) == length(comp)){
    cat(length(colnames(onemap.object$geno)) - length(comp),
        "markers of original onemap object were considered non-informative after new SNP calling and were removed from analysis \n")
    cat("This approach changed", (sum(conv_geno != onemap.object$geno[,comp])/length(onemap.object$geno[,comp]))*100, "% of the genotypes\n")
    
    # If some marker in onemap object are now non-informative and some marker were recoved from vcf                
  } else if(length(colnames(onemap.object$geno)) - length(comp) > 0 & length(mks) > length(comp)){
    cat(length(mks) - length(colnames(onemap.object$geno)[comp]), "markers were recovered from vcf file and added to onemap object \n")
    comp1 <- which(mks %in% colnames(onemap.object$geno))
    cat("This approach changed", (sum(conv_geno[,comp1] != onemap.object$geno[,comp])/length(onemap.object$geno[,comp]))*100, "% of the genotypes \
n")
    
    # If any marker in onemap object were considered non-informative and some marker were recovered from vcf       
  } else if(length(colnames(onemap.object$geno)) - length(comp) == 0 & length(mks) > length(comp)){
    comp1 <- which(mks %in% colnames(onemap.object$geno))
    cat("This approach changed", (sum(conv_geno[,comp1] != onemap.object$geno)/length(onemap.object$geno))*100, "% of the genotypes from biallelic markers\n")
  } else{
    cat("This approach changed", (sum(conv_geno != onemap.object$geno)/length(onemap.object$geno))*100, "% of the genotypes from biallelic markers\n")
  }
  
  cat("New onemap object contains", length(mks), "biallelic markers\n")
  
  maxpostprob <- t(1- maxpostprob)
  colnames(conv_geno)  <- colnames(maxpostprob) <- mks
  inds <- rownames(conv_geno)  <- rownames(maxpostprob) <- rownames(onemap.object$geno)
  
  onemap_updog$geno <- conv_geno
  onemap_updog$n.ind <- length(inds)
  onemap_updog$n.mar <- length(mks)
  onemap_updog$segr.type.num <- segr.type.num
  onemap_updog$segr.type <- segr.type
  
  if(use_genotypes_probs){
    onemap_updog.new <- create_probs(onemap.obj = onemap_updog,
                                     genotypes_probs = genotypes_probs,
                                     global_error = global_error)
  } else if(use_genotypes_errors){
    onemap_updog.new <- create_probs(onemap.obj = onemap_updog,
                                     genotypes_errors = maxpostprob,
                                     global_error = global_error)
  } else if(!is.null(global_error)){
    onemap_updog.new <- create_probs(onemap.obj = onemap_updog,
                                     global_error = global_error)
  }
  
  if(!rm_multiallelic){
    if(length(multi.mks) > 0)
      onemap_updog.new <- combine_onemap(onemap_updog.new, mult.obj)
  }
  structure(onemap_updog.new)
}

##' Plot a barplot with error probabilities values
##' 
##' @param onemap.obj an object of class \code{onemap} coming from \code{read_onemap}, 
##' \code{read_mapmaker}, \code{onemap_read_vcfR}, \code{updog_genotype} functions
##' 
##' @param mk.type a TRUE/FALSE value to define if genotypes will colored by marker type
##' 
##' @param select.ind a string defining specific individuals. The graphic will only contains 
##' error probability information of this individuals.
##' 
##' @param select.mk a string defining specific markers. The graphic will only contains 
##' error probability information of this markers.
##' 
##' @param n.int
##' 
##' @examples 
##' 
##' \dontrun{
##' data(onemap_example_out)
##' p <- plot_error_dist(onemap_example_out)
##' ggplot2::ggsave("errors_out.jpg", p width=7, height=4, dpi=300)
##' }
##' @import ggplot2
##' 
##' @export
plot_error_dist <- function(onemap.obj = NULL, mk.type = TRUE, 
                            select.ind = NULL, select.mk = NULL, 
                            n.int= NULL){
  
  #Do checks
  M <- reshape2::melt(onemap.obj$error)
  M <- cbind(M, type = rep(onemap.obj$segr.type, each = onemap.obj$n.ind))
  
  if(!is.null(select.ind)){
    idx <- which(as.character(M$Var1) %in% select.ind)
    if(length(idx)==0){
      stop("This individual does not exist in the given onemap object\n")
    } else{ M <- M[idx,] }
  }
  
  if(!is.null(select.mk)){
    idx <- which(as.character(M$Var2) %in% select.mk)
    if(length(idx)==0){
      stop("This marker does not exist in the given onemap object\n")
    }
    M <- M[idx,]
  }
  
  if(is.null(n.int)){
    if(nclass.FD(M$value) <= 10000000){
      breaks <- pretty(range(M$value), n = nclass.FD(M$value), min.n = 1)
      binwidth <- breaks[2]-breaks[1]
    } else {
      stop("Choose a n.int value for your graphic\n")
    }
  } else {
    breaks <- pretty(range(M$value), n = n.int, min.n = 1)
    binwidth <- breaks[2]-breaks[1]
  }
  
  if(mk.type){
    p <- ggplot(M, aes(value, fill = as.factor(type))) +
      geom_histogram(binwidth = binwidth)
  } else {
    p <- ggplot(M, aes(value)) +
      geom_histogram(binwidth = binwidth)
  }
  
  p <- p +  xlab("error probability") + ylab("count") + labs(fill = "Marker type") + scale_x_continuous(limits = c(0, max(M$value)))
  return(p)
}
Cristianetaniguti/genotyping4onemap documentation built on Aug. 26, 2020, 10:32 a.m.