R/write_haplotypes.R

Defines functions plot.onemap_progeny_haplotypes progeny_haplotypes parents_haplotypes

Documented in parents_haplotypes plot.onemap_progeny_haplotypes progeny_haplotypes

#######################################################################
##                                                                     ##
## Package: onemap                                                     ##
##                                                                     ##
## File: write_haplotypes.R                                            ##
## Contains: parents_haplotypes, progeny_haplotypes,                   ##
## plot.onemap_progeny_haplotypes,                                     ##
## plot.onemap_progeny_haplotypes_counts                               ##
##                                                                     ##
## Written by  Getulio Caixeta Ferreira and Cristiane Taniguti         ##
##                                                                     ##
## First version: 2020/05/26                                           ##
## License: GNU General Public License version 3 or later              ##
##                                                                     ##
#######################################################################

globalVariables(c("grp", "for.split", ".", "pos", "prob", "pos2", 
                  "allele","parents.homologs", "progeny.homologs", "homolog"))
globalVariables(c("V1", "V2", "V3", "V4", 
                  "H1_P1", "H1_P2", "H2_P1", "H2_P2",
                  "P1_H1", "P1_H2", "P2_H1", "P2_H2"))

#' Generates data.frame with parents estimated haplotypes 
#'
#' @param ... objects of class sequence
#' @param group_names vector of characters defining the group names
#'
#' @return data.frame with group ID (group), marker number (mk.number) 
#' and names (mk.names), position in centimorgan (dist) and parents haplotypes 
#' (P1_1, P1_2, P2_1, P2_2)
#' 
#' @examples 
#' \donttest{
#' data("onemap_example_out")
#' twopts <- rf_2pts(onemap_example_out)
#' lg1 <- make_seq(twopts, 1:5)
#' lg1.map <- map(lg1)
#' parents_haplotypes(lg1.map)
#' }
#' @author Getulio Caixeta Ferreira, \email{getulio.caifer@@gmail.com}
#' @author Cristiane Taniguti, \email{chtaniguti@@tamu.edu}
#' @export
parents_haplotypes <- function(..., group_names=NULL){
  input <- list(...)
  if(length(input) == 0) stop("argument '...' missing, with no default")
  # Accept list of sequences or list of list of sequences
  if(inherits(input[[1]], "sequence")) input.map <- input else input.map <- unlist(input, recursive = FALSE)
  if(!all(sapply(input.map, function(x) inherits(x, "sequence")))) stop(paste("Input objects must be of 'sequence' class. \n"))
  if(is.null(group_names)) group_names <- paste("Group",seq(input.map), sep = " - ")
  
  if(all(sapply(input, function(x) inherits(x, "sequence")))){
    n <- length(sapply(input, function(x) inherits(x, "sequence")))
  } else n <- 1
  
  input_temp <- input
  out_dat <- data.frame()
  for(z in 1:n){
    if(all(sapply(input_temp, function(x) inherits(x, "sequence")))) input <- input_temp[[z]]
    marnames <- colnames(input$data.name$geno)[input$seq.num]
    if(length(input$seq.rf) == 1 && input$seq.rf == -1) {
      # no information available for the order
      warning("\nParameters not estimated.\n\n")
    } else {
      # convert numerical linkage phases to strings
      link.phases <- matrix(NA,length(input$seq.num),2)
      link.phases[1,] <- rep(1,2)
      for (i in 1:length(input$seq.phases)) {
        switch(EXPR=input$seq.phases[i],
               link.phases[i+1,] <- link.phases[i,]*c(1,1),
               link.phases[i+1,] <- link.phases[i,]*c(1,-1),
               link.phases[i+1,] <- link.phases[i,]*c(-1,1),
               link.phases[i+1,] <- link.phases[i,]*c(-1,-1),
        )
      }
      
      ## display results
      marnumbers <- input$seq.num
      distances <- c(0,cumsum(get(get(".map.fun", envir=.onemapEnv))(input$seq.rf)))
      ## whith diplotypes for class 'outcross'
      if(inherits(input$data.name, c("outcross", "f2"))){
        ## create diplotypes from segregation types and linkage phases
        link.phases <- apply(link.phases,1,function(x) paste(as.character(x),collapse="."))
        parents <- matrix("",length(input$seq.num),4)
        for (i in 1:length(input$seq.num))
          parents[i,] <- return_geno(input$data.name$segr.type[input$seq.num[i]],link.phases[i])
        out_dat_temp <- data.frame(group= group_names[z], mk.number = marnumbers, mk.names = marnames, dist = as.numeric(distances), 
                                   P1_1 = parents[,1],
                                   P1_2 = parents[,2],
                                   P2_1 = parents[,3],
                                   P2_2 = parents[,4])
        out_dat <- rbind(out_dat, out_dat_temp)
      }
      ## whithout diplotypes for other classes
      else if(inherits(input$data.name, c("backcross", "riself", "risib"))){
        warning("There is only a possible phase for this cross type\n")
      } else warning("invalid cross type")
    }
  }
  return(out_dat)
}


#' Generate data.frame with genotypes estimated by HMM and its probabilities
#'
#' @param ... Map(s) or list(s) of maps. Object(s) of class sequence.
#' @param ind vector with individual index to be evaluated or "all" to include all individuals
#' @param most_likely logical; if  \code{TRUE}, the most likely genotype receive 1 and all the rest 0. 
#' If there are more than one most likely both receive 0.5.
#' if FALSE (default) the genotype probability is plotted.
#' @param group_names Names of the groups.
#' 
#' @return a data.frame information: individual (ind) and marker ID, group ID (grp), position in centimorgan (pos), 
#' genotypes probabilities (prob), parents, and the parents homologs and the allele IDs. 
#' 
#' @examples 
#' \donttest{
#' data("onemap_example_out")
#' twopts <- rf_2pts(onemap_example_out)
#' lg1 <- make_seq(twopts, 1:5)
#' lg1.map <- map(lg1)
#' progeny_haplotypes(lg1.map)
#' }
#' @import dplyr
#' @import tidyr
#' 
#' @author Getulio Caixeta Ferreira, \email{getulio.caifer@@gmail.com}
#' @author Cristiane Taniguti, \email{chtaniguti@@tamu.edu}
#' @export
progeny_haplotypes <- function(...,
                               ind = 1, 
                               group_names=NULL,
                               most_likely = FALSE){
  #input map
  input <- list(...)
  if(length(input) == 0) stop("argument '...' missing, with no default")
  # Accept list of sequences or list of list of sequences
  if(inherits(input[[1]], "sequence")) input.map <- input else input.map <- unlist(input, recursive = FALSE)
  if(!all(sapply(input.map, function(x) inherits(x, "sequence")))) stop(paste("Input objects must be of 'sequence' class. \n"))
  if(is.null(group_names)) group_names <- paste("Group",seq(input.map), sep = " - ")
  n.mar <- sapply(input.map, function(x) length(x$seq.num))
  n.ind <- sapply(input.map, function(x) ncol(x$probs))/n.mar
  ind.names <- lapply(input.map, function(x) rownames(x$data.name$geno))
  ind.names <- unique(unlist(ind.names)) 
  if(length(unique(n.ind)) != 1) stop("At least one of the sequences have different number of individuals in dataset.")
  n.ind <- unique(n.ind)
  if(is.null(ind.names)) ind.names <- 1:n.ind
  if(ind[1] == "all"){
    ind <- 1:n.ind
  } 
  
  probs <- lapply(1:length(input.map), function(x) cbind(ind = rep(1:n.ind, each = n.mar[x]),
                                                         grp = group_names[x],
                                                         marker = input.map[[x]]$seq.num,
                                                         pos = c(0,cumsum(kosambi(input.map[[x]]$seq.rf))),
                                                         as.data.frame(t(input.map[[x]]$probs))))
  probs <- lapply(probs, function(x) split.data.frame(x, x$ind)[ind])
  
  if(inherits(input.map[[1]]$data.name, "outcross") | inherits(input.map[[1]]$data.name, "f2")){
    phase <- list('1' = c(1,2,3,4),
                  '2' = c(2,1,4,3),
                  '3' = c(3,4,1,2),
                  "4" = c(4,3,2,1))
    
    seq.phase <- lapply(input.map, function(x) c(1,x$seq.phases))
    
    # Adjusting phases
    for(g in seq_along(input.map)){
      for(i in seq_along(ind)){
        for(m in seq(n.mar[g])){
          probs[[g]][[i]][m:n.mar[g],5:8] <- probs[[g]][[i]][m:n.mar[g],phase[[seq.phase[[g]][m]]]+4]
        }
      }
    }
    
    # If there are two most probably genotypes both will receive 0.5
    probs <- do.call(rbind,lapply(probs, function(x) do.call(rbind, x)))
    
    if(most_likely){
      probs[,5:8] <- t(apply(probs[,5:8], 1, function(x) as.numeric(x == max(x))/sum(x == max(x))))
    }
    
    # When P1_H1 it means the allele in homolog 1 of the parent 1
    # when H1_P1 is the parent 1 allele in the progeny homolog 1
    if(inherits(input.map[[1]]$data.name, "outcross")){
      probs <- probs %>%
        mutate(P1_H1 = V1 + V2,
               P1_H2 = V3 + V4,
               P2_H1 = V1 + V3,
               P2_H2 = V2 + V4)  %>%
        select(ind, marker, grp, pos, P1_H1, P1_H2, P2_H1, P2_H2) %>% 
        gather(parents, prob, P1_H1, P1_H2, P2_H1, P2_H2) 
      
      new.col <- t(sapply(strsplit(probs$parents, "_"), "[", 1:2))
      colnames(new.col) <- c("parents", "parents.homologs")
      cross <- "outcross"
    } else {
      probs <- probs %>%
        mutate(H1_P1 = V1 + V2,
               H1_P2 = V3 + V4,
               H2_P1 = V1 + V3,
               H2_P2 = V2 + V4)  %>%
        select(ind, marker, grp, pos, H1_P1, H1_P2, H2_P1, H2_P2) %>% 
        gather(parents, prob, H1_P1, H1_P2, H2_P1, H2_P2) 
      
      new.col <- t(sapply(strsplit(probs$parents, "_"), "[", 1:2))
      colnames(new.col) <- c("progeny.homologs", "parents")
      cross <- "f2"
    }
  } else {
    probs <- do.call(rbind,lapply(probs, function(x) do.call(rbind, x)))
    if(most_likely){
      probs[,5:6] <- t(apply(probs[,5:6], 1, function(x) as.numeric(x == max(x))/sum(x == max(x))))
    }
    
    if (inherits(input.map[[1]]$data.name, c("backcross")) | inherits(input.map[[1]]$data.name, c("riself", "risib"))){
      if(inherits(input.map[[1]]$data.name, c("backcross"))){
        cross <- "backcross"
        probs <- probs %>% 
          mutate(H1_P1 = V1 + V2, # homozygote parent
                 H1_P2 = 0,
                 H2_P1 = V2, 
                 H2_P2 = V1) 
      } else if (inherits(input.map[[1]]$data.name, c("riself", "risib"))){
        cross <- "rils"
        probs <- probs %>%
          mutate(H1_P1 = V1,
                 H1_P2 = V2,
                 H2_P1 = V1,
                 H2_P2 = V2)
      }
      probs <- probs %>%  select(ind, marker, grp, pos, H1_P1, H1_P2, H2_P1, H2_P2) %>% 
        gather(parents, prob, H1_P1, H1_P2, H2_P1, H2_P2) 
      new.col <- t(sapply(strsplit(probs$parents, "_"), "[", 1:2))
      colnames(new.col) <- c("progeny.homologs", "parents")
    }
  }
  
  probs <- cbind(probs, new.col)
  probs <- cbind(probs[,-5], allele = probs[,5])
  probs <- as.data.frame(probs)
  probs$marker = colnames(input.map[[1]]$data.name$geno)[probs$marker]
  probs$ind <- ind.names[probs$ind]
  
  if(most_likely) flag <- "most.likely" else flag <- "by.probs"
  
  class(probs) <- c("onemap_progeny_haplotypes", cross, "data.frame", flag)
  return(probs)
}

##' Plots progeny haplotypes
##' 
##' Figure is generated with the haplotypes for each selected individual. As a representation, the recombination breakpoints are here considered 
##' to be in the mean point of the distance between two markers.  It is important to highlight that it did not reflects the exact breakpoint position, 
##' specially if the genetic map have low resolution. 
##' 
##' @param x object of class onemap_progeny_haplotypes
##' @param col Color of parents' homologous.
##' @param position "split" or "stack"; if "split" (default) the alleles' are plotted separately. if "stack" the parents' alleles are plotted together.
##' @param show_markers logical; if  \code{TRUE}, the markers (default) are plotted.
##' @param main An overall title for the plot; default is \code{NULL}.
##' @param ncol number of columns of the facet_wrap
##' @param ... currently ignored
##' 
##' @method plot onemap_progeny_haplotypes
##' 
##' @examples 
##'  \donttest{
#' data("onemap_example_out")
#' twopts <- rf_2pts(onemap_example_out)
#' lg1 <- make_seq(twopts, 1:5)
#' lg1.map <- map(lg1)
#' plot(progeny_haplotypes(lg1.map))
##' }
##' 
##' @return a ggplot graphic
##' 
##' @import ggplot2
#' @import dplyr
#' @import tidyr
#' 
##' @author Getulio Caixeta Ferreira, \email{getulio.caifer@@gmail.com}
##' @author Cristiane Taniguti, \email{chtaniguti@@tamu.edu}
##' 
##' @export
plot.onemap_progeny_haplotypes <- function(x,
                                           col = NULL, 
                                           position = "stack",
                                           show_markers = TRUE, 
                                           main = "Genotypes", ncol=4, ...){
  
  if(inherits(x, "outcross")){
    n <- c("H1" = "P1", "H2" = "P2")
    progeny.homologs <- names(n)[match(x$parents, n)]
    probs <- cbind(x, progeny.homologs)
  } else {
    probs <- x
  }
  
  probs <- probs %>% group_by(ind, grp, allele) %>%
    do(rbind(.,.[nrow(.),])) %>%
    do(mutate(.,
              pos2 = c(0,pos[-1]-diff(pos)/2), # Because we don't know exactly where 
              # the recombination occurs, we ilustrate it in the mean point between 
              # markers
              pos = c(pos[-nrow(.)], NA)))
  
  if(inherits(x, "outcross")){
    p <- ggplot(probs, aes(x = pos, col=allele, alpha = prob)) + ggtitle(main) 
  } else {
    p <- ggplot(probs, aes(x = pos, col=parents, alpha = prob)) + ggtitle(main) 
  }
  
  p <- p +  facet_wrap(~ ind + grp , ncol = ncol) +
    scale_alpha_continuous(range = c(0,1)) +
    guides(fill = guide_legend(reverse = TRUE)) +
    labs(alpha = "Prob", col = "Allele", x = "position (cM)")
  
  if(is.null(col)) p <- p + scale_color_brewer(palette="Set1")
  else p <- p + scale_color_manual(values = rev(col))
  
  if(position == "stack"){
    p <- p + geom_line(aes(x = pos2, y = progeny.homologs), size = ifelse(show_markers, 4, 5)) + labs(y = "progeny homologs")
    if(show_markers) p <- p + geom_point(aes(y = progeny.homologs), size = 5, stroke = 2, na.rm = T, shape = "|")
  } 
  if(position == "split"){
    p <- p + geom_line(aes(x = pos2, y = allele), size = ifelse(show_markers, 4, 5))
    if(show_markers) p <- p + geom_point(aes(y = allele), size = 5, stroke = 2, na.rm = T, shape = "|")
  } 
  return(p)
}

#' Plot number of breakpoints by individuals
#' 
#' Generate graphic with the number of break points for each individual 
#' considering the most likely genotypes estimated by the HMM.
#' Genotypes with same probability for two genotypes are removed.
#'  By now, only available for outcrossing and f2 intercross. 
#' 
#' @param x object of class onemap_progeny_haplotypes
#' 
#' @examples 
#' \donttest{
#' data("onemap_example_out")
#' twopts <- rf_2pts(onemap_example_out)
#' lg1 <- make_seq(twopts, 1:5)
#' lg1.map <- map(lg1)
#' progeny_haplotypes_counts(progeny_haplotypes(lg1.map, most_likely = TRUE))
#' }
#' 
#' @return a \code{data.frame} with columns individuals ID (ind), group ID (grp),
#' homolog (homolog) and counts of breakpoints
#' 
#' @import dplyr
#' @import tidyr
#'@export
progeny_haplotypes_counts <- function(x){
  if(!inherits(x, "onemap_progeny_haplotypes")) stop("Input need is not of class onemap_progeny_haplotyes")
  if(!inherits(x, "most.likely")) stop("The most likely genotypes must receive maximum probability (1)")
  cross <- class(x)[2]
  
  # Some genotypes receives prob of 0.5, here we need to make a decision about them
  # Here we keep the genotype of the marker before it
  doubt <- x[which(x$prob == 0.5),]
  if(dim(doubt)[1] > 0){
    if(any(which(x$prob == 0.5)) == 1){
      x[1, "prob"] <- x[2, "prob"]
    }
    repl <- x[which(x$prob == 0.5)-1,"prob"]
    x[which(x$prob == 0.5),"prob"] <- repl
  }
  
  x <- x[which(x$prob == 1),]
  x <- x[order(x$ind, x$grp, x$prob, x$parents,x$pos),]
  
  if(inherits(x, "outcross")){
    counts <- x %>% group_by(ind, grp) %>% 
      mutate(seq = sequence(rle(as.character(allele))$length) == 1)  %>%
      group_by(ind, grp, parents) %>%
      summarise(counts = sum(seq) -1,.groups = "keep") %>% ungroup()
    counts$parents <- gsub("P", "H", counts$parents)
    
  } else {
    counts <- x %>% group_by(ind, grp, progeny.homologs) %>%
      mutate(seq = sequence(rle(as.character(parents))$length) == 1) %>%
      summarise(counts = sum(seq) -1) %>% ungroup()
  }
  colnames(counts)[3] <- "homolog"
  class(counts) <- c("onemap_progeny_haplotypes_counts", cross, "data.frame")
  return(counts)
}


globalVariables(c("counts", "colorRampPalette", "alleles"))


##' Plot recombination breakpoints counts for each individual
##'
##' @param x object of class onemap_progeny_haplotypes_counts
##' @param by_homolog logical, if TRUE plots counts by homolog (two for each individuals), if FALSE plots total counts by individual
##' @param n.graphics integer defining the number of graphics to be plotted, they separate the individuals in different plots 
##' @param ncol integer defining the number of columns in plot
##' @param ... currently ignored
##' 
##' @return a ggplot graphic
##' 
##' @examples 
##' \donttest{
#' data("onemap_example_out")
#' twopts <- rf_2pts(onemap_example_out)
#' lg1 <- make_seq(twopts, 1:5)
#' lg1.map <- map(lg1)
#' prog.haplo <- progeny_haplotypes(lg1.map, most_likely = TRUE)
#' plot(progeny_haplotypes_counts(prog.haplo))
##' }
##' 
##' @method plot onemap_progeny_haplotypes_counts
##' @import ggplot2
##' @importFrom ggpubr ggarrange
##' @import dplyr
##' @import tidyr
##' @importFrom RColorBrewer brewer.pal
##' @importFrom grDevices colorRamp colorRampPalette
##' 
##' @export
plot.onemap_progeny_haplotypes_counts <- function(x, 
                                                  by_homolog = FALSE, 
                                                  n.graphics =NULL, 
                                                  ncol=NULL, ...){
  if(!inherits(x, "onemap_progeny_haplotypes_counts")) stop("Input need is not of class onemap_progeny_haplotyes_counts")

  p <- list()
  n.ind <- length(unique(x$ind))
  nb.cols <- n.ind
  mycolors <- colorRampPalette(brewer.pal(12, "Paired"))(nb.cols)
  set.seed(20)
  mycolors <- sample(mycolors)
  
  if(by_homolog){ 
    if(is.null(n.graphics) & is.null(ncol)){
      n.ind <- dim(x)[1]/2
      if(n.ind/25 <= 1) {
        n.graphics = 1
        ncol=1 
      }else { n.graphics = round(n.ind/25,0)
      ncol=round(n.ind/25,0)
      }
    }
    size <- dim(x)[1]
    if(size%%n.graphics == 0){
      div.n.graphics <- rep(1:n.graphics, each= size/n.graphics) 
    } else {           
      div.n.graphics <- c(rep(1:n.graphics, each = round(size/n.graphics,0)), rep(n.graphics, size%%n.graphics))
    }
    
    y_lim_counts <- max(x$counts)
    div.n.graphics <- div.n.graphics[1:size]
    p <- x %>% mutate(div.n.graphics = div.n.graphics) %>%
      split(., .$div.n.graphics) %>%
      lapply(., function(x) ggplot(x, aes(x=homolog, y=counts)) +
               geom_bar(stat="identity", aes(fill=grp)) + theme_minimal() + 
               coord_flip() + 
               scale_fill_manual(values=mycolors) +
               facet_grid(ind ~ ., switch = "y") +
               theme(axis.title.y = element_blank(),
                     axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                     strip.text.y.left = element_text(angle = 0)) + 
               labs(fill="groups") +
               ylim(0,y_lim_counts)
      )      
  } else {
    x <- x %>% ungroup %>% group_by(ind, grp) %>%
      summarise(counts = sum(counts))
    
    
    if(is.null(n.graphics) & is.null(ncol)){
      if(n.ind/25 <= 1) {
        n.graphics = 1
        ncol=1 
      }else { 
        n.graphics = round(n.ind/25,0)
        ncol=round(n.ind/25,0)
      }
    }
    
    size <-n.ind
    if(size%%n.graphics == 0){
      div.n.graphics <- rep(1:n.graphics, each= size/n.graphics) 
    } else {           
      div.n.graphics <-   c(rep(1:n.graphics, each = round(size/n.graphics,0)),rep(n.graphics, size%%n.graphics))
    }
    div.n.graphics <- div.n.graphics[1:n.ind]
    div.n.graphics <- rep(div.n.graphics, each = length(unique(x$grp)))
    
    x$ind <- factor(as.character(x$ind), levels = sort(as.character(unique(x$ind))))
    
    temp <- x %>% ungroup() %>% group_by(ind) %>%
      summarise(total = sum(counts))
    
    y_lim_counts <- max(temp$total)
    p <- x %>% ungroup() %>%  mutate(div.n.graphics = div.n.graphics) %>%
      split(., .$div.n.graphics) %>%
      lapply(., function(x) ggplot(x, aes(x=ind, y=counts, fill=grp)) +
               geom_bar(stat="identity") + coord_flip() + 
               scale_fill_manual(values=mycolors) +
               theme(axis.title.y = element_blank(), 
                     axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
               labs(fill="groups") +
               ylim(0,y_lim_counts)
      ) 
  }
  p <- ggarrange(plotlist = p, common.legend = T, label.x = 1, ncol = ncol, nrow = round(n.graphics/ncol,0))
  return(p)
}

Try the onemap package in your browser

Any scripts or data that you put into this service are public.

onemap documentation built on Nov. 26, 2022, 9:05 a.m.