R/plotting_functions.R

Defines functions add_annotations plot_region_data plot_genomewide_data

Documented in add_annotations plot_genomewide_data plot_region_data

#' Plot genome-wide statistics
#'
#'This function will plot the results from genome-wide analyses of diversity,
#'association, etc., that can be generated by a small variety of programs. As
#'of 2019-01, it will handle any tibbles produced by the file_loading_functions.
#'In reality, any tibble containing columns with the names "scaf" (scaffold),
#'"ps" (position on scaffold), "stat" (statistic for plotting), and "chr"
#'(chromosome the scaffold is assigned to, usually a dummy on first load) can be
#'used as input.
#'
#' @param input A tibble with four named columns (scaf, ps, stat, and chr)
#' @param type Type of input: gwas, popgen. gwas will -log10 the stat, while
#'     popgen will estimate the ylimits.
#' @param scaffold_lengths Name of a two-column tab delimited file containing
#'     scaffold and length.
#' @param plotting_column Name of the column to plot. Usually will be 'stat'
#' @param col1 Color of odd chromosomes
#' @param col2 Color of even chromosomes
#' @inheritParams get_cumulative_positions
#' @return  A ggplot
#'
#' @export
#'
#' @examples
#' a1 <- system.file("extdata", "test.gemma_gwas.txt.gz",
#'                  package = "gwplotting")
#' a2 <- system.file("extdata", "test.chromSizes.txt.gz",
#'                   package="gwplotting")
#'
#' b <- load_gemma_gwas( a1, pval = 'p_wald' )
#'
#' c <- plot_genomewide_data( b, type = 'gwas', scaffold_lengths = a2,
#'                            plotting_column = 'stat', col1 = 'black',
#'                            col2 = 'grey' )
#' c
plot_genomewide_data <- function( input, type = 'gwas', scaffold_lengths,
                                  plotting_column = 'stat', col1 = 'grey',
                                  col2 = 'black'){

  # Take care of GWAS ----------------------------------------------------------
  if( type == 'gwas' ){

    message("--Getting FDR values...\n")

    pvals <- input$stat[ ! is.na( input$stat ) ]
    pvals.adj <- p.adjust( pvals, method = 'fdr' )
    x <- cbind( pvals, pvals.adj )
    x <- x[ order( x[ , 'pvals.adj'], decreasing = T ), ]
    p10 <- -log10(head( x[ x[ , 'pvals.adj' ] <= 0.1, ], n = 1 )[1])
    p05 <- -log10(head( x[ x[ , 'pvals.adj' ] <= 0.05, ], n = 1 )[1])
    p01 <- -log10(head( x[ x[ , 'pvals.adj' ] <= 0.01, ], n = 1 )[1])

    cat( paste0("--10%, 5%, and 1% cutoffs are ",p10,", ",p05,", and ",p01,"\n") )

    # Significance lines
    sigLines <- list(ggplot2::geom_hline( ggplot2::aes( yintercept = p10 ),
                                          color = 'blue', alpha = 0.50,
                                          linetype = 'dashed'),
                     ggplot2::geom_hline( ggplot2::aes( yintercept = p01 ),
                                          color = 'red', alpha = 0.50,
                                          linetype = 'dashed' ),
                     ylab( bquote( -log[10]( italic(p)))))

    # Reduce gwas and plot size
    input <- filter( input, stat <= 0.05 )
    input$stat <- -log10(input$stat)

    mxs <- ceiling(max(input$stat, na.rm = T ) / 5) * 5

    scaley <- list(ggplot2::coord_cartesian( ylim = c( 1.1, mxs ) ),
                   ggplot2::scale_y_continuous( expand = c( 0.05, 0.05 ),
                                                limits = c( 1, mxs ),
                                                breaks = c(1, seq( 5, mxs, by = 5 )),
                                                labels = c( '', seq( 5, mxs, by = 5 ))))

  } else {

    sigLines <- NA

    mns <- min( input[, plotting_column ], na.rm = T )
    mxs <- max( input[, plotting_column ], na.rm = T )

    scaley <- ggplot2::scale_y_continuous( expand = c( 0.05, 0.05 ),
                                           limits = c( mns - 0.1*mns, mxs + 0.1*mxs) )

  }

  # Get cumulative positions ---------------------------------------------------
  input <- get_cumulative_positions( input, scaffold_lengths = scaffold_lengths,
                                     buffer = 500000, after = 'chromosomes' )

  # Plotting -------------------------------------------------------------------

  # Find tickmarks for x-axis
  axisdf <- input %>% dplyr::group_by( chr ) %>%
    dplyr::summarize( center = ( max(bp_cum) + min(bp_cum) ) / 2 )

  num_chr <- nrow( axisdf )

  # Guess if we're plotting scaffolds or chromosomes
  if( num_chr > 50 ){
    labs <- NULL
    name <- 'Scaffolds'
  } else {
    labs <- axisdf$chr
    name <- 'Chromosome'
  }

  # Generate plot
  for_plot <- ggplot2::ggplot( input,
                               ggplot2::aes_string( x = "bp_cum" , y = plotting_column)) +

    # Show all points
    ggplot2::geom_point( ggplot2::aes( color = factor( input$chr, levels = unique( input$chr )) ),
                         alpha = 0.75 , size = 1 ) +
    ggplot2::scale_color_manual( values = rep(c(col1, col2), ceiling( num_chr/2 ) ,
                                              length = num_chr )) +

    # custom X axis:
    ggplot2::scale_x_continuous( breaks = axisdf$center, expand = c( 0.01, 0.01 ),
                                 labels = labs, name = name ) +

    # Add appropriate y scale
    scaley +

    # Customize the theme:
    ggplot2::theme_bw() +
    ggplot2::theme(
      legend.position="none",
      panel.border = ggplot2::element_blank(),
      panel.grid.major.x = ggplot2::element_blank(),
      panel.grid.minor.x = ggplot2::element_blank(),
      axis.ticks = ggplot2::element_blank(),
      axis.text = ggplot2::element_text( size = 6 ),
      axis.title = ggplot2::element_text( size = 8 ))

  if( is.list( sigLines ) ){
    for_plot <- for_plot + sigLines
  }

  return( for_plot )

}


#' Plot statistics in a particular region
#'
#' This is a function to plot statistics on a particular scaffold or chromosome,
#' or in a particular region of that scaffold or chromosome. The input may be 1)
#' the raw tibble from any of the loading functions, 2) a reordered tibble, or
#' 3) a tibble with cumulative positions. This function will operate differently
#' on each of these input types. If the input already has cumulative positions
#' (i.e. column bp_cum), then the "from" and "to" variables are interpreted as
#' "from = bp_cum start" and "to = bp_cum end". If the input does not have
#' column bp_cum, then this function will generate bp_cum according to the
#' desired chromosome or scaffold, assume that the supplied "from" and "to"
#' arguments are scaffold positions, and determine the appropriate "from" and
#' "to" coordinates from the new bp_cum coordinates. You may only specify a
#' chromosome OR a scaffold.
#'
#' @param input A tibble with four named columns (scaf, ps, stat, and chr)
#' @param chromosome Chromosome to plot. This OR scaffold may be specified.
#' @param scaffold Scaffold to plot. This OR chromosome may be specified.
#' @param from Start position, if desired.
#' @param to End position, if desired
#' @param type Type of statistic: gwas, popgen.
#' @param plotting_column Name of the column to plot. Usually will be 'stat'
#' @param scaffold_lengths Path to a two-column tab delimited file with
#'     scaffold and length.
#' @param color Color for points.
#' @param include_all_p Plot p-values <0.05?
#'
#' @return A ggplot.
#' @export
#'
#' @examples
#' a1 <- system.file("extdata", "test.gemma_gwas.txt.gz",
#'                  package = "gwplotting")
#' a2 <- system.file("extdata", "test.chromSizes.txt.gz",
#'                   package="gwplotting")
#'
#' b <- load_gemma_gwas( a1, pval = 'p_wald' )
#' chrom <- b$chr[ b$scaf == "scaffold_1000" ][1]
#'
#' c <- plot_region_data( b, chromosome = chrom, type = 'gwas',
#' plotting_column = 'stat', scaffold_lengths = a2)
#' c
plot_region_data <- function( input, chromosome = NA, scaffold = NA, from = 1,
                              to = NA, type = 'gwas', color = 'black',
                              plotting_column = 'stat', scaffold_lengths,
                              include_all_p = FALSE ){

  # Check scaffold / chromosome ------------------------------------------------
  if( ! is.na( scaffold ) & ! is.na( chromosome )){
    message("ERROR: cannot specify both chromosome and scaffold")
    stop()
  } else if( is.na( scaffold ) & is.na( chromosome ) ){
    message("ERROR: you must specify a chromosome or a scaffold")
    stop()
  } else if( ! is.na( scaffold ) ){

    # Want a scaffold, so from and to should be in terms of ps
    if( is.na( to )  ){ # Get just a small region
      input <- dplyr::filter( input, scaf == scaffold )
    } else {
      input <- dplyr::filter( input, scaf == scaffold, ps >= from, ps <= to )
    }

  } else {

    # Get chr of interest
    input <- dplyr::filter( input, chr == chromosome )

    # Handle from and to when bp_cum is not present
    if( ! "bp_cum" %in% colnames( input ) ){

      # If there's only one scaffold for this chromosome, obviously
      # don't need to shift positions
      if( length( unique( input$scaf ) ) == 1 ){
        input <- mutate( input, bp_cum = ps )
      } else {
        input <- get_cumulative_positions( input, scaffold_lengths = scaffold_lengths,
                                           buffer = 0, after = 'scaffolds')
      }
    }

    # If you want a subregion,
    if( ! is.na( to ) ){
      input <- dplyr::filter( input, bp_cum >= from, bp_cum <= to )
    }
  }

  # Take care of GWAS ----------------------------------------------------------
  if( type == 'gwas' ){

    # Reduce gwas and plot size
    if( ! include_all_p ){
      input <- filter( input, stat <= 0.05 )
    }

    input$stat <- -log10(input$stat)
    ylimits <- c( 0, ceiling(max(input$stat, na.rm = T ) / 5) * 5 )

  } else {
    mns <- min( input[,  plotting_column ], na.rm = T )
    mxs <- max( input[, plotting_column ], na.rm = T )
    ylimits <- c( mns - 0.1 * mns, mxs + 0.1 * mxs )
  }


  # Plotting -------------------------------------------------------------------

  for_plot <- ggplot2::ggplot( input,
                               ggplot2::aes_string( x = "bp_cum" ,
                                                    y = plotting_column)) +

    # Show all points
    ggplot2::geom_point( alpha = 0.75 , size = 1, color = color ) +

    # custom X axis:
    ggplot2::scale_x_continuous( expand = c( 0.005, 0 ),
                                 name = paste( "chromosome", chromosome, sep = ' ' )  ) +
    ggplot2:: scale_y_continuous( expand = c( 0.05, 0 ), limits = ylimits ) +

    # Customize the theme:
    ggplot2::theme_bw() +
    ggplot2::theme(
      legend.position="none",
      panel.border = ggplot2::element_blank(),
      panel.grid.major.x = ggplot2::element_blank(),
      panel.grid.minor.x = ggplot2::element_blank(),
      axis.ticks = ggplot2::element_blank(),
      axis.text = ggplot2::element_text( size = 6 ),
      axis.title = ggplot2::element_text( size = 8 )
    )

  # Appropriate y lab
  if( type == "gwas" ){
    for_plot <- for_plot + ylab( bquote( -log[10](italic(p))))
  }

  return( for_plot )

}

#' Add geom_rect() objects to a ggplot.
#'
#'This function will calculate the coordinates of scaffold spans or gene models
#'for addition to a ggplot. It will use scaffold spans encoded in the input
#'tibble. GFF features of the specified type will be plotted if a GFF file is
#'specified.
#'
#'The input must only contain information from the region that you want to plot.
#'
#' @param input Tibble containing scaf, ps, stat, chr, and bp_cum ONLY for the
#'     region you want to plot.
#' @param ymin Minimum y positions for geom_rects.
#' @param ymax Maximum y positions for geom_rects.
#' @param scaffold_lengths Path to a two-column tab-delimited file containing
#'     scaffold names and their lengths.
#' @param gff Path to a GFF or BED format file, if plotting gene models.
#' @param feature GFF feature type to plot. Usually genes. Not extensively
#'     tested.
#' @param mapping Path to reordering file.
#'
#' @return A four-column tibble containing an ID, xmins, xmaxs, ymins, and ymaxs
#'     for use in a geom_rect() call.
#' @export
#'
#' @examples
#' a1 <- system.file("extdata", "test.gemma_gwas.txt.gz",
#'                  package = "gwplotting")
#' a2 <- system.file("extdata", "test.chromSizes.txt.gz",
#'                   package="gwplotting")
#'
#' # Load statistics, add cumulative positions (bp_cum)
#' b <- load_gemma_gwas( a1, pval = 'p_wald' )
#' c <- get_cumulative_positions( b, scaffold_lengths = a2 )
#'
#' # Get scaffold rects
#' scafs <- add_annotations( c, ymin = 0, ymax = 0.5 )
#'
#' scafs
#'
#' # Add rects to a plot of the region data
#' d <- plot_region_data( c, chromosome = 1, type = 'gwas',
#'                        plotting_column = 'stat', scaffold_lengths = a2) +
#'      geom_rect( inherit.aes = F,
#'                 data = scafs,
#'                 mapping = aes( xmin = xmins,
#'                                xmax = xmaxs,
#'                                ymin = ymins,
#'                                ymax = ymaxs),
#'                 color = 'black',
#'                 alpha = 0.5,
#'                 size = 0.25 )
#'
#' d
add_annotations <- function( input, ymin, ymax, scaffold_lengths,
                             gff = NA, feature = NA, mapping = NA){

  # Check to see if all params specified
  if( ! 'bp_cum' %in% colnames(input) ){
    stop("ERROR: You must first calculate cumulative positions for your input.\n")
  } else if( is.na( ymin ) | is.na( ymax ) | is.na( scaffold_lengths ) ){
    stop("ERROR: You must specify input, ymin, ymax, and scaffold_lengths.\n")
  } else if( ! is.na( gff ) ){
    if( is.na( feature ) & ( grepl( ".gff", gff ) | grepl( ".gff.gz", gff ) |
                             grepl( ".gff3", gff) | grepl( ".gff3.gz", gff)) ){
      stop("ERROR: You must specify a feature type when plotting GFF features.\n")
    } else if( ! file.exists( gff ) ){
      stop("ERROR: GFF file doesn't exist.\n")
    }
  }

  # What scaffolds are in this region
  scafs_in_region <- unique( input$scaf )

  # Handle scaffolds first. Doesn't matter if they're reordered because they're
  # just based on length.
  if( is.na( gff ) ){

    # empty vectors
    xmins <- vector( length = length(scafs_in_region) )
    xmaxs <- xmins

    # loop over scaffolds, get min/max bp_cum positions for each scaffold
    for( i in 1:length( scafs_in_region )){
      xmins[i] <- head( dplyr::filter( input, scaf == scafs_in_region[i] & !is.na( bp_cum ) ), n = 1)$bp_cum;
      xmaxs[i] <- tail( dplyr::filter( input, scaf == scafs_in_region[i] & !is.na( bp_cum ) ), n = 1)$bp_cum;
    }

    ymins <- rep( ymin, length( xmins ) )
    ymaxs <- rep( ymax, length( xmins ) )
    ids <- scafs_in_region

  } else {

    # We're doing GFF or BED features. If there is a mapping file, then the input
    # has already been reordered and we need to take into account strand
    # information when determining x values.

    # if it's a bed file, figure out what kind and keep necessary info
    if( grepl( ".bed$", gff ) | grepl( ".bed.gz$", gff ) ){

      g <- readr::read_tsv( gff, comment = "#", col_names = F )


      if( ncol( g ) >= 6 ){
        # bed12
        g <- tidyr::select( g, X1, X2, X3, X4, X6 ) %>%
          dplyr::filter( X1 %in% scafs_in_region )
        colnames(g) <- c('chrom','start','end','id','strand')
      } else if( ncol( g ) >= 4 ){
        #>bed4
        g <- tidyr::select( g, X1, X2, X3, X4 ) %>%
          tibble::add_column( strand = rep( '+', nrow(g) ) ) %>%
          dplyr::filter( X1 %in% scafs_in_region )
        colnames(g) <- c('chrom','start','end','id','strand')
      } else if( ncol(g) == 3 ){
        #simple bed
        colnames(g) <- c('chrom','start','end')
        g <- tibble::add_column( id = 1:nrow(g), strand = rep( '+', nrow(g) )) %>%
          dplyr::filter( chrom %in% scafs_in_region )
      }

      # otherwise, should be GFF
    } else if( grepl( ".gff$", gff ) |  grepl( ".gff.gz$", gff ) |
               grepl( ".gff3$", gff ) | grepl( ".gff3.gz$", gff )){

      gff_colnames <- c('chrom','source','type','start','end',
                        'score','strand','phase','attributes')

      # open, filter,
      g <- readr::read_tsv( file = gff, comment = "#",
                            col_names = gff_colnames  ) %>%
        dplyr::filter( type == feature & chrom %in% scafs_in_region & ! grepl("other",source) ) %>%
        dplyr::select( chrom, start, end, strand, attributes ) %>%
        dplyr::mutate( id = stringr::str_replace( attributes, "[;].*$", "") ) %>%
        dplyr::mutate( id = stringr::str_replace( id, "ID=","")) %>%
        dplyr::select( -attributes )

    } else {
      # not bed or gff
      stop("ERROR: Expecting a .bed(.gz) or .gff(.gz) file for gff.\n")
    }

    # Hopefully there are some features in here
    if( nrow(g) == 0 ){
      stop("ERROR: No features in this region\n")
    } else {
      message("Plotting ", nrow(g)," features")
    }

    # There are some models on this/these scaffolds. Now, must get strand and
    # convert coordinates.
    # g = chrom, start, end, strand, id

    # Get scaffold --> chromosome orientation
    if( is.na( mapping ) ){
      scaf_strands <- cbind( scafs_in_region, rep('+', length(scafs_in_region)))
      colnames( scaf_strands ) <- c('scaf','strand')
    } else {
      scaf_strands <- readr::read_table2( mapping , col_names = T  )
      colnames(scaf_strands)[1] <- 'scaf'

      scaf_strands <- dplyr::select( scaf_strands, scaf, strand ) %>%
        dplyr::filter( scaf %in% scafs_in_region )

      #scaf_lens is not ordered
      scaf_lens <- readr::read_table2( scaffold_lengths,
                                       col_names = c('scaf', 'length')) %>%
        dplyr::filter( scaf %in% scafs_in_region )

    }

    # Now, shift each gff feature by bp_cum offset
    message("Processing ", length(scafs_in_region)," scaffolds")

    for( s in 1:length(scafs_in_region) ){

      # input
      # scaf_strands --> d.f(scaf, strand)
      # g --> tibble(chrom, start, end, strand, id ) [ chrom == scaffold ]

      # If there are gene models on the scaffold, then proceed
      if( length( g$start[ g$chrom == scafs_in_region[s]] ) > 0 ){

        # get the offset
        offset <- input$bp_cum[ input$scaf == scafs_in_region[s] ][1] -
          input$ps[ input$scaf == scafs_in_region[s]][1]

        # nothing special if it's on the + strand
        if( scaf_strands$strand[ scaf_strands$scaf == scafs_in_region[s] ] == '+' |
            is.na( scaf_strands$strand[ scaf_strands$scaf == scafs_in_region[s]]) ){
          g$start[ g$chrom == scafs_in_region[s] ] <-
            g$start[ g$chrom == scafs_in_region[s] ] + offset

          g$end[ g$chrom == scafs_in_region[s] ] <-
            g$end[ g$chrom == scafs_in_region[s] ] + offset

        } else {

          # on the - strand, so (length - ps).
          g$start[ g$chrom == scafs_in_region[s] ] <-
            scaf_lens$length[ scaf_lens$scaf == scafs_in_region[s] ] -
            g$start[ g$chrom == scafs_in_region[s] ] + offset

          g$end[ g$chrom == scafs_in_region[s] ] <-
            scaf_lens$length[ scaf_lens$scaf == scafs_in_region[s] ] -
            g$end[ g$chrom == scafs_in_region[s] ] + offset

          # switch strand for genes on those scaffolds that map in the
          # - orientation relative to chromosomes
          ori_strands <- g$strand[ g$chrom == scafs_in_region[s] ]
          ori_strands <- replace( ori_strands, ori_strands == '+', 'P')
          ori_strands <- replace( ori_strands, ori_strands == '-', 'M')
          ori_strands <- replace( ori_strands, ori_strands == 'P', '-')
          ori_strands <- replace( ori_strands, ori_strands == 'M', '+')

          # replace strand
          g$strand[ g$chrom == scafs_in_region[s] ] <- ori_strands

        }
      }
    } # Then there are no models on this scaffold

    # Keep only models within the plotting region
    g <- dplyr::filter( g, start >= min( input$bp_cum ) &
                          start <= max( input$bp_cum ) &
                          end >= min( input$bp_cum ) &
                          end <= max( input$bp_cum) )
    # get vectors of interest
    xmins <- g$start
    xmaxs <- g$end
    ymins <- unlist(dplyr::mutate(g, ymins = ifelse( strand == '-', ymin,
                                                     ymin + 0.5*(ymax - ymin))) %>%
                      dplyr::select( ymins ))

    ymaxs <- unlist(dplyr::mutate(g, ymaxs = ifelse( strand == '-', ymin + 0.5*(ymax - ymin),
                                                     ymax))  %>%
                      dplyr::select( ymaxs ))

    ids <- g$id

  } # END of if is.na(gff)

  # construct tibble to return
  fr <- tibble::tibble(ids, xmins, xmaxs, ymins, ymaxs )

  return( fr )

}
nwvankuren/gwplotting documentation built on April 17, 2021, 2:37 p.m.