R/miss_plot_heatmap.R

Defines functions miss_plot_heatmap

Documented in miss_plot_heatmap

#' Plot a heatmap of missing information
#'
#' Use to visualise the presence or absence of genetic information
#' for all sample and locus combinations. Can also be subset
#' by population.
#'
#' @param dat Data table: Contains genetic information and must have
#' the following columns,
#' \enumerate{
#'   \item The sampled individuals (see param \code{sampCol}).
#'   \item The chromosome/contig ID (see param \code{chromCol}).
#'   \item The position ID (see param \code{posCol}).
#'   \item The genotype column, e.g. a genotype of allele frequency,
#'   (see param \code{genoCol}).
#' }
#'
#' @param sortLoci Character: Sort the loci by their genomic order (\code{'order'}),
#' or by the proportion of missing data (\code{'missing'}). Default = \code{'order'}.
#'
#' @param sortSamp Character: Sort the samples by their alpha-numeric order (\code{'order'}),
#' or by the proportion of missing data (\code{'missing'}). Default = \code{'order'}.
#'
#' @param sampCol Character: The column name with the sampled
#' individual ID. Default = \code{'SAMPLE'}.
#'
#' @param chromCol Character: The column name with the chromosome/contig ID.
#' Default = \code{'CHROM'}.
#'
#' @param posCol Character: The column name with the position ID. Default = \code{'POS'}.
#'
#' @param genoCol Character: The column name with the genotype info.
#' Default = \code{'GT'}. Missing data should be represeted by \code{NA}.
#'
#' @param popCol Character: The column name with the population ID.
#' Optional parameter. Default = \code{NA}.
#'
#' @param plotColours Character: Vector of colours to use in plotting with a
#' length of 2. The first colour is the missing colour, and the second colour
#' is the non-missing colour. Default = NULL.
#'
#' @param plotNCol Integer: The number of columns to arrange indiviudal
#' population plots into. Only takes effect when \code{popCol} is specified.
#' Default = 2.
#'
#' @return Returns a ggplot object.
#'
#' @examples
#' library(genomalicious)
#'
#' ####   MISSING GENOTYPE DATA   ####
#' data(data_Genos)
#' datGt <- data_Genos
#'
#' # Add missing values
#' datGt <- do.call(
#'  'rbind',
#'  # Split data table by sample, and iterate through samples, X
#'  split(datGt, by='POP') %>%
#'    lapply(., function(Dpop){
#'      pop <- Dpop$POP[1]
#'
#'      if(pop=='Pop1'){
#'        pr <- 0.1
#'      } else if(pop=='Pop2'){
#'        pr <- 0.2
#'      } else if(pop %in% c('Pop3','Pop4')){
#'        pr <- 0.05
#'      }
#'
#'      # Numbers and unique loci and samples
#'      num.loc <- Dpop$LOCUS %>% unique %>% length
#'      uniq.loc <- Dpop$LOCUS %>% unique
#'      num.samp <- Dpop$SAMPLE %>% unique %>% length
#'      uniq.samp <- Dpop$SAMPLE %>% unique
#'
#'      # Vector of missingness
#'      num.miss <- rbinom(n=num.samp, size=num.loc, prob=pr)
#'
#'      # Iterate through samples and add unique loci
#'      for(i in 1:num.samp){
#'        locs <- sample(uniq.loc, size=num.miss[i], replace=FALSE)
#'        Dpop[SAMPLE==uniq.samp[i] & LOCUS%in%locs, GT:=NA]
#'      }
#'
#'      # Return
#'      return(Dpop)
#'    }
#'    )
#' )
#'
#' head(datGt, 10)
#'
#' # Heatmaps, default and specified colours
#' miss_plot_heatmap(datGt)
#' miss_plot_heatmap(datGt, plotColours=c('black', 'deeppink2'))
#'
#' # Heatmaps, by population
#' miss_plot_heatmap(datGt, popCol='POP', plotNCol=4)
#'
#' ####   CATCH PLOT OUTPUT FOR LATER USE   ####
#' gg4pops <- miss_plot_heatmap(datGt, popCol='POP')
#' plot(gg4pops)
#'
#' @export
miss_plot_heatmap <- function(
    dat, sortLoci='order', sortSamp='order',
    chromCol='CHROM', posCol='POS', sampCol='SAMPLE',
    genoCol='GT', popCol=NA, plotColours=NULL, plotNCol=2){

  # --------------------------------------------+
  # Libraries, assertions, and setup
  # --------------------------------------------+
  for(lib in c('ggplot2', 'data.table','gridExtra')){ require(lib, character.only = TRUE)}

  # Rename columns
  colnames(dat)[
    match(c(chromCol, posCol, genoCol, sampCol), colnames(dat))
  ] <- c('CHROM', 'POS', 'GT', 'SAMPLE')

  # Create a list of loci
  uniq.loci <- dat[, c('CHROM','POS')] %>%
    unique %>%
    setorder(., CHROM, POS) %>%
    .[, paste0(CHROM, '_', POS)]

  dat[, LOCUS:=paste0(CHROM, '_', POS)]

  # Create a list of unique samples
  uniq.samp <- dat$SAMPLE %>% unique

  # Rename the population column, if it was specified
  if(is.na(popCol)==FALSE){
    colnames(dat)[which(colnames(dat)==popCol)] <- 'POP'
  }

  # Check that sortLoci and sortSamp are correctly specified.
  check.sort.loci <- sortLoci %in% c('order','missing')
  if(check.sort.loci!=TRUE){
    stop('Argument `sortLoci` must be one of "order" and "missing". See ?miss_plot_heatmap.')
  }

  check.sort.samp <- sortSamp %in% c('order','missing')
  if(check.sort.samp!=TRUE){
    stop('Argument `sortSamp` must be one of "order" and "missing". See ?miss_plot_heatmap.')
  }

  # --------------------------------------------+
  # Code
  # --------------------------------------------+

  # Make two colours, if only one specified
  if(length(plotColours)<2){
    plotColours <- c('white', 'royalblue')
  } else if(length(plotColours)>2){
    plotColours <- plotColours[1:2]
  }

  # Assign a new column to record missing data
  dat[, MISS:=as.integer(is.na(GT))]

  # Sort
  if(sortLoci=='order'){
    dat[, LOCUS:=factor(LOCUS, levels=uniq.loci)]
  } else if(sortLoci=='missing'){
    miss.levels.loci <- dat[, sum(MISS)/length(LOCUS), by=LOCUS] %>%
      setorder(., V1) %>%
      .[['LOCUS']]
    dat[, LOCUS:=factor(LOCUS, levels=miss.levels.loci)]
  }

  if(sortSamp=='order'){
    dat[, SAMPLE:=factor(SAMPLE, levels=uniq.samp)]
  } else if(sortSamp=='missing'){
    miss.level.samp <- dat[, sum(MISS)/length(SAMPLE), by=SAMPLE] %>%
      setorder(., V1) %>%
      .[['SAMPLE']]
    dat[, SAMPLE:=factor(SAMPLE, levels=miss.level.samp)]
  }

  # Plot
  if(is.na(popCol)){
    # If no population column is specified (NA)
    gg <- (ggplot(dat, aes(x=SAMPLE, y=LOCUS))
           + geom_tile(aes(fill=as.factor(MISS)), colour=NA)
           + scale_fill_manual(values=c('0'=plotColours[1], '1'=plotColours[2]))
           + labs(x='Samples', y='Locus')
           + theme(
             axis.text.x = element_blank()
             ,axis.text.y = element_blank()
             ,axis.ticks = element_blank()
             ,panel.border=element_rect(fill=NA, colour='black')
             ,panel.background=element_blank()
             ,panel.grid=element_blank()
             ,legend.position='none'
           )
    )
  } else {
    # If population column is specified, make indiviudal plot
    # for each population
    gg <- (ggplot(dat, aes(x=SAMPLE, y=LOCUS))
           + geom_tile(aes(fill=as.factor(MISS)), colour=NA)
           + scale_fill_manual(values=c('1'=plotColours[1], '0'=plotColours[2]))
           + labs(x='Samples', y='Locus')
           + facet_wrap(~POP, scales='free_x', ncol=plotNCol)
           + theme(
             axis.text.x = element_blank()
             ,axis.text.y = element_blank()
             ,axis.ticks = element_blank()
             ,legend.position='none'
             ,strip.text.x=element_text(face='bold')
           )
    )
  }

  # Finish up
  return(gg)
}
j-a-thia/genomalicious documentation built on Oct. 19, 2024, 7:51 p.m.