#' Plot genome-wide statistics
#'
#'This function will plot the results from genome-wide analyses of diversity,
#'association, etc., that can be generated by a small variety of programs. As
#'of 2019-01, it will handle any tibbles produced by the file_loading_functions.
#'In reality, any tibble containing columns with the names "scaf" (scaffold),
#'"ps" (position on scaffold), "stat" (statistic for plotting), and "chr"
#'(chromosome the scaffold is assigned to, usually a dummy on first load) can be
#'used as input.
#'
#' @param input A tibble with four named columns (scaf, ps, stat, and chr)
#' @param type Type of input: gwas, popgen. gwas will -log10 the stat, while
#' popgen will estimate the ylimits.
#' @param scaffold_lengths Name of a two-column tab delimited file containing
#' scaffold and length.
#' @param plotting_column Name of the column to plot. Usually will be 'stat'
#' @param col1 Color of odd chromosomes
#' @param col2 Color of even chromosomes
#' @inheritParams get_cumulative_positions
#' @return A ggplot
#'
#' @export
#'
#' @examples
#' a1 <- system.file("extdata", "test.gemma_gwas.txt.gz",
#' package = "gwplotting")
#' a2 <- system.file("extdata", "test.chromSizes.txt.gz",
#' package="gwplotting")
#'
#' b <- load_gemma_gwas( a1, pval = 'p_wald' )
#'
#' c <- plot_genomewide_data( b, type = 'gwas', scaffold_lengths = a2,
#' plotting_column = 'stat', col1 = 'black',
#' col2 = 'grey' )
#' c
plot_genomewide_data <- function( input, type = 'gwas', scaffold_lengths,
plotting_column = 'stat', col1 = 'grey',
col2 = 'black'){
# Take care of GWAS ----------------------------------------------------------
if( type == 'gwas' ){
message("--Getting FDR values...\n")
pvals <- input$stat[ ! is.na( input$stat ) ]
pvals.adj <- p.adjust( pvals, method = 'fdr' )
x <- cbind( pvals, pvals.adj )
x <- x[ order( x[ , 'pvals.adj'], decreasing = T ), ]
p10 <- -log10(head( x[ x[ , 'pvals.adj' ] <= 0.1, ], n = 1 )[1])
p05 <- -log10(head( x[ x[ , 'pvals.adj' ] <= 0.05, ], n = 1 )[1])
p01 <- -log10(head( x[ x[ , 'pvals.adj' ] <= 0.01, ], n = 1 )[1])
cat( paste0("--10%, 5%, and 1% cutoffs are ",p10,", ",p05,", and ",p01,"\n") )
# Significance lines
sigLines <- list(ggplot2::geom_hline( ggplot2::aes( yintercept = p10 ),
color = 'blue', alpha = 0.50,
linetype = 'dashed'),
ggplot2::geom_hline( ggplot2::aes( yintercept = p01 ),
color = 'red', alpha = 0.50,
linetype = 'dashed' ),
ylab( bquote( -log[10]( italic(p)))))
# Reduce gwas and plot size
input <- filter( input, stat <= 0.05 )
input$stat <- -log10(input$stat)
mxs <- ceiling(max(input$stat, na.rm = T ) / 5) * 5
scaley <- list(ggplot2::coord_cartesian( ylim = c( 1.1, mxs ) ),
ggplot2::scale_y_continuous( expand = c( 0.05, 0.05 ),
limits = c( 1, mxs ),
breaks = c(1, seq( 5, mxs, by = 5 )),
labels = c( '', seq( 5, mxs, by = 5 ))))
} else {
sigLines <- NA
mns <- min( input[, plotting_column ], na.rm = T )
mxs <- max( input[, plotting_column ], na.rm = T )
scaley <- ggplot2::scale_y_continuous( expand = c( 0.05, 0.05 ),
limits = c( mns - 0.1*mns, mxs + 0.1*mxs) )
}
# Get cumulative positions ---------------------------------------------------
input <- get_cumulative_positions( input, scaffold_lengths = scaffold_lengths,
buffer = 500000, after = 'chromosomes' )
# Plotting -------------------------------------------------------------------
# Find tickmarks for x-axis
axisdf <- input %>% dplyr::group_by( chr ) %>%
dplyr::summarize( center = ( max(bp_cum) + min(bp_cum) ) / 2 )
num_chr <- nrow( axisdf )
# Guess if we're plotting scaffolds or chromosomes
if( num_chr > 50 ){
labs <- NULL
name <- 'Scaffolds'
} else {
labs <- axisdf$chr
name <- 'Chromosome'
}
# Generate plot
for_plot <- ggplot2::ggplot( input,
ggplot2::aes_string( x = "bp_cum" , y = plotting_column)) +
# Show all points
ggplot2::geom_point( ggplot2::aes( color = factor( input$chr, levels = unique( input$chr )) ),
alpha = 0.75 , size = 1 ) +
ggplot2::scale_color_manual( values = rep(c(col1, col2), ceiling( num_chr/2 ) ,
length = num_chr )) +
# custom X axis:
ggplot2::scale_x_continuous( breaks = axisdf$center, expand = c( 0.01, 0.01 ),
labels = labs, name = name ) +
# Add appropriate y scale
scaley +
# Customize the theme:
ggplot2::theme_bw() +
ggplot2::theme(
legend.position="none",
panel.border = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
panel.grid.minor.x = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.text = ggplot2::element_text( size = 6 ),
axis.title = ggplot2::element_text( size = 8 ))
if( is.list( sigLines ) ){
for_plot <- for_plot + sigLines
}
return( for_plot )
}
#' Plot statistics in a particular region
#'
#' This is a function to plot statistics on a particular scaffold or chromosome,
#' or in a particular region of that scaffold or chromosome. The input may be 1)
#' the raw tibble from any of the loading functions, 2) a reordered tibble, or
#' 3) a tibble with cumulative positions. This function will operate differently
#' on each of these input types. If the input already has cumulative positions
#' (i.e. column bp_cum), then the "from" and "to" variables are interpreted as
#' "from = bp_cum start" and "to = bp_cum end". If the input does not have
#' column bp_cum, then this function will generate bp_cum according to the
#' desired chromosome or scaffold, assume that the supplied "from" and "to"
#' arguments are scaffold positions, and determine the appropriate "from" and
#' "to" coordinates from the new bp_cum coordinates. You may only specify a
#' chromosome OR a scaffold.
#'
#' @param input A tibble with four named columns (scaf, ps, stat, and chr)
#' @param chromosome Chromosome to plot. This OR scaffold may be specified.
#' @param scaffold Scaffold to plot. This OR chromosome may be specified.
#' @param from Start position, if desired.
#' @param to End position, if desired
#' @param type Type of statistic: gwas, popgen.
#' @param plotting_column Name of the column to plot. Usually will be 'stat'
#' @param scaffold_lengths Path to a two-column tab delimited file with
#' scaffold and length.
#' @param color Color for points.
#' @param include_all_p Plot p-values <0.05?
#'
#' @return A ggplot.
#' @export
#'
#' @examples
#' a1 <- system.file("extdata", "test.gemma_gwas.txt.gz",
#' package = "gwplotting")
#' a2 <- system.file("extdata", "test.chromSizes.txt.gz",
#' package="gwplotting")
#'
#' b <- load_gemma_gwas( a1, pval = 'p_wald' )
#' chrom <- b$chr[ b$scaf == "scaffold_1000" ][1]
#'
#' c <- plot_region_data( b, chromosome = chrom, type = 'gwas',
#' plotting_column = 'stat', scaffold_lengths = a2)
#' c
plot_region_data <- function( input, chromosome = NA, scaffold = NA, from = 1,
to = NA, type = 'gwas', color = 'black',
plotting_column = 'stat', scaffold_lengths,
include_all_p = FALSE ){
# Check scaffold / chromosome ------------------------------------------------
if( ! is.na( scaffold ) & ! is.na( chromosome )){
message("ERROR: cannot specify both chromosome and scaffold")
stop()
} else if( is.na( scaffold ) & is.na( chromosome ) ){
message("ERROR: you must specify a chromosome or a scaffold")
stop()
} else if( ! is.na( scaffold ) ){
# Want a scaffold, so from and to should be in terms of ps
if( is.na( to ) ){ # Get just a small region
input <- dplyr::filter( input, scaf == scaffold )
} else {
input <- dplyr::filter( input, scaf == scaffold, ps >= from, ps <= to )
}
} else {
# Get chr of interest
input <- dplyr::filter( input, chr == chromosome )
# Handle from and to when bp_cum is not present
if( ! "bp_cum" %in% colnames( input ) ){
# If there's only one scaffold for this chromosome, obviously
# don't need to shift positions
if( length( unique( input$scaf ) ) == 1 ){
input <- mutate( input, bp_cum = ps )
} else {
input <- get_cumulative_positions( input, scaffold_lengths = scaffold_lengths,
buffer = 0, after = 'scaffolds')
}
}
# If you want a subregion,
if( ! is.na( to ) ){
input <- dplyr::filter( input, bp_cum >= from, bp_cum <= to )
}
}
# Take care of GWAS ----------------------------------------------------------
if( type == 'gwas' ){
# Reduce gwas and plot size
if( ! include_all_p ){
input <- filter( input, stat <= 0.05 )
}
input$stat <- -log10(input$stat)
ylimits <- c( 0, ceiling(max(input$stat, na.rm = T ) / 5) * 5 )
} else {
mns <- min( input[, plotting_column ], na.rm = T )
mxs <- max( input[, plotting_column ], na.rm = T )
ylimits <- c( mns - 0.1 * mns, mxs + 0.1 * mxs )
}
# Plotting -------------------------------------------------------------------
for_plot <- ggplot2::ggplot( input,
ggplot2::aes_string( x = "bp_cum" ,
y = plotting_column)) +
# Show all points
ggplot2::geom_point( alpha = 0.75 , size = 1, color = color ) +
# custom X axis:
ggplot2::scale_x_continuous( expand = c( 0.005, 0 ),
name = paste( "chromosome", chromosome, sep = ' ' ) ) +
ggplot2:: scale_y_continuous( expand = c( 0.05, 0 ), limits = ylimits ) +
# Customize the theme:
ggplot2::theme_bw() +
ggplot2::theme(
legend.position="none",
panel.border = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
panel.grid.minor.x = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.text = ggplot2::element_text( size = 6 ),
axis.title = ggplot2::element_text( size = 8 )
)
# Appropriate y lab
if( type == "gwas" ){
for_plot <- for_plot + ylab( bquote( -log[10](italic(p))))
}
return( for_plot )
}
#' Add geom_rect() objects to a ggplot.
#'
#'This function will calculate the coordinates of scaffold spans or gene models
#'for addition to a ggplot. It will use scaffold spans encoded in the input
#'tibble. GFF features of the specified type will be plotted if a GFF file is
#'specified.
#'
#'The input must only contain information from the region that you want to plot.
#'
#' @param input Tibble containing scaf, ps, stat, chr, and bp_cum ONLY for the
#' region you want to plot.
#' @param ymin Minimum y positions for geom_rects.
#' @param ymax Maximum y positions for geom_rects.
#' @param scaffold_lengths Path to a two-column tab-delimited file containing
#' scaffold names and their lengths.
#' @param gff Path to a GFF or BED format file, if plotting gene models.
#' @param feature GFF feature type to plot. Usually genes. Not extensively
#' tested.
#' @param mapping Path to reordering file.
#'
#' @return A four-column tibble containing an ID, xmins, xmaxs, ymins, and ymaxs
#' for use in a geom_rect() call.
#' @export
#'
#' @examples
#' a1 <- system.file("extdata", "test.gemma_gwas.txt.gz",
#' package = "gwplotting")
#' a2 <- system.file("extdata", "test.chromSizes.txt.gz",
#' package="gwplotting")
#'
#' # Load statistics, add cumulative positions (bp_cum)
#' b <- load_gemma_gwas( a1, pval = 'p_wald' )
#' c <- get_cumulative_positions( b, scaffold_lengths = a2 )
#'
#' # Get scaffold rects
#' scafs <- add_annotations( c, ymin = 0, ymax = 0.5 )
#'
#' scafs
#'
#' # Add rects to a plot of the region data
#' d <- plot_region_data( c, chromosome = 1, type = 'gwas',
#' plotting_column = 'stat', scaffold_lengths = a2) +
#' geom_rect( inherit.aes = F,
#' data = scafs,
#' mapping = aes( xmin = xmins,
#' xmax = xmaxs,
#' ymin = ymins,
#' ymax = ymaxs),
#' color = 'black',
#' alpha = 0.5,
#' size = 0.25 )
#'
#' d
add_annotations <- function( input, ymin, ymax, scaffold_lengths,
gff = NA, feature = NA, mapping = NA){
# Check to see if all params specified
if( ! 'bp_cum' %in% colnames(input) ){
stop("ERROR: You must first calculate cumulative positions for your input.\n")
} else if( is.na( ymin ) | is.na( ymax ) | is.na( scaffold_lengths ) ){
stop("ERROR: You must specify input, ymin, ymax, and scaffold_lengths.\n")
} else if( ! is.na( gff ) ){
if( is.na( feature ) & ( grepl( ".gff", gff ) | grepl( ".gff.gz", gff ) |
grepl( ".gff3", gff) | grepl( ".gff3.gz", gff)) ){
stop("ERROR: You must specify a feature type when plotting GFF features.\n")
} else if( ! file.exists( gff ) ){
stop("ERROR: GFF file doesn't exist.\n")
}
}
# What scaffolds are in this region
scafs_in_region <- unique( input$scaf )
# Handle scaffolds first. Doesn't matter if they're reordered because they're
# just based on length.
if( is.na( gff ) ){
# empty vectors
xmins <- vector( length = length(scafs_in_region) )
xmaxs <- xmins
# loop over scaffolds, get min/max bp_cum positions for each scaffold
for( i in 1:length( scafs_in_region )){
xmins[i] <- head( dplyr::filter( input, scaf == scafs_in_region[i] & !is.na( bp_cum ) ), n = 1)$bp_cum;
xmaxs[i] <- tail( dplyr::filter( input, scaf == scafs_in_region[i] & !is.na( bp_cum ) ), n = 1)$bp_cum;
}
ymins <- rep( ymin, length( xmins ) )
ymaxs <- rep( ymax, length( xmins ) )
ids <- scafs_in_region
} else {
# We're doing GFF or BED features. If there is a mapping file, then the input
# has already been reordered and we need to take into account strand
# information when determining x values.
# if it's a bed file, figure out what kind and keep necessary info
if( grepl( ".bed$", gff ) | grepl( ".bed.gz$", gff ) ){
g <- readr::read_tsv( gff, comment = "#", col_names = F )
if( ncol( g ) >= 6 ){
# bed12
g <- tidyr::select( g, X1, X2, X3, X4, X6 ) %>%
dplyr::filter( X1 %in% scafs_in_region )
colnames(g) <- c('chrom','start','end','id','strand')
} else if( ncol( g ) >= 4 ){
#>bed4
g <- tidyr::select( g, X1, X2, X3, X4 ) %>%
tibble::add_column( strand = rep( '+', nrow(g) ) ) %>%
dplyr::filter( X1 %in% scafs_in_region )
colnames(g) <- c('chrom','start','end','id','strand')
} else if( ncol(g) == 3 ){
#simple bed
colnames(g) <- c('chrom','start','end')
g <- tibble::add_column( id = 1:nrow(g), strand = rep( '+', nrow(g) )) %>%
dplyr::filter( chrom %in% scafs_in_region )
}
# otherwise, should be GFF
} else if( grepl( ".gff$", gff ) | grepl( ".gff.gz$", gff ) |
grepl( ".gff3$", gff ) | grepl( ".gff3.gz$", gff )){
gff_colnames <- c('chrom','source','type','start','end',
'score','strand','phase','attributes')
# open, filter,
g <- readr::read_tsv( file = gff, comment = "#",
col_names = gff_colnames ) %>%
dplyr::filter( type == feature & chrom %in% scafs_in_region & ! grepl("other",source) ) %>%
dplyr::select( chrom, start, end, strand, attributes ) %>%
dplyr::mutate( id = stringr::str_replace( attributes, "[;].*$", "") ) %>%
dplyr::mutate( id = stringr::str_replace( id, "ID=","")) %>%
dplyr::select( -attributes )
} else {
# not bed or gff
stop("ERROR: Expecting a .bed(.gz) or .gff(.gz) file for gff.\n")
}
# Hopefully there are some features in here
if( nrow(g) == 0 ){
stop("ERROR: No features in this region\n")
} else {
message("Plotting ", nrow(g)," features")
}
# There are some models on this/these scaffolds. Now, must get strand and
# convert coordinates.
# g = chrom, start, end, strand, id
# Get scaffold --> chromosome orientation
if( is.na( mapping ) ){
scaf_strands <- cbind( scafs_in_region, rep('+', length(scafs_in_region)))
colnames( scaf_strands ) <- c('scaf','strand')
} else {
scaf_strands <- readr::read_table2( mapping , col_names = T )
colnames(scaf_strands)[1] <- 'scaf'
scaf_strands <- dplyr::select( scaf_strands, scaf, strand ) %>%
dplyr::filter( scaf %in% scafs_in_region )
#scaf_lens is not ordered
scaf_lens <- readr::read_table2( scaffold_lengths,
col_names = c('scaf', 'length')) %>%
dplyr::filter( scaf %in% scafs_in_region )
}
# Now, shift each gff feature by bp_cum offset
message("Processing ", length(scafs_in_region)," scaffolds")
for( s in 1:length(scafs_in_region) ){
# input
# scaf_strands --> d.f(scaf, strand)
# g --> tibble(chrom, start, end, strand, id ) [ chrom == scaffold ]
# If there are gene models on the scaffold, then proceed
if( length( g$start[ g$chrom == scafs_in_region[s]] ) > 0 ){
# get the offset
offset <- input$bp_cum[ input$scaf == scafs_in_region[s] ][1] -
input$ps[ input$scaf == scafs_in_region[s]][1]
# nothing special if it's on the + strand
if( scaf_strands$strand[ scaf_strands$scaf == scafs_in_region[s] ] == '+' |
is.na( scaf_strands$strand[ scaf_strands$scaf == scafs_in_region[s]]) ){
g$start[ g$chrom == scafs_in_region[s] ] <-
g$start[ g$chrom == scafs_in_region[s] ] + offset
g$end[ g$chrom == scafs_in_region[s] ] <-
g$end[ g$chrom == scafs_in_region[s] ] + offset
} else {
# on the - strand, so (length - ps).
g$start[ g$chrom == scafs_in_region[s] ] <-
scaf_lens$length[ scaf_lens$scaf == scafs_in_region[s] ] -
g$start[ g$chrom == scafs_in_region[s] ] + offset
g$end[ g$chrom == scafs_in_region[s] ] <-
scaf_lens$length[ scaf_lens$scaf == scafs_in_region[s] ] -
g$end[ g$chrom == scafs_in_region[s] ] + offset
# switch strand for genes on those scaffolds that map in the
# - orientation relative to chromosomes
ori_strands <- g$strand[ g$chrom == scafs_in_region[s] ]
ori_strands <- replace( ori_strands, ori_strands == '+', 'P')
ori_strands <- replace( ori_strands, ori_strands == '-', 'M')
ori_strands <- replace( ori_strands, ori_strands == 'P', '-')
ori_strands <- replace( ori_strands, ori_strands == 'M', '+')
# replace strand
g$strand[ g$chrom == scafs_in_region[s] ] <- ori_strands
}
}
} # Then there are no models on this scaffold
# Keep only models within the plotting region
g <- dplyr::filter( g, start >= min( input$bp_cum ) &
start <= max( input$bp_cum ) &
end >= min( input$bp_cum ) &
end <= max( input$bp_cum) )
# get vectors of interest
xmins <- g$start
xmaxs <- g$end
ymins <- unlist(dplyr::mutate(g, ymins = ifelse( strand == '-', ymin,
ymin + 0.5*(ymax - ymin))) %>%
dplyr::select( ymins ))
ymaxs <- unlist(dplyr::mutate(g, ymaxs = ifelse( strand == '-', ymin + 0.5*(ymax - ymin),
ymax)) %>%
dplyr::select( ymaxs ))
ids <- g$id
} # END of if is.na(gff)
# construct tibble to return
fr <- tibble::tibble(ids, xmins, xmaxs, ymins, ymaxs )
return( fr )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.