R/gg_locusplot.R

Defines functions gg_locusplot

Documented in gg_locusplot

# Function to plot regional association with LD
#' Create a regional association plot
#'
#' Returns a ggplot object containing a regional association plot (-log10(p-value) as a function of chromosomal position, with variants colored by linkage disequilibrium to reference variant).
#' This function allows the user to integrate genome wide association study (GWAS) summary statistics for a locus of interest with linkage disequilibrium information (obtained using the University of Michigan LocusZoom API <https://portaldev.sph.umich.edu/>) for that locus to create a regional association plot.
#'
#' @param df Dataframe containing columns with rsid, chromosome, position, reference/effect allele, alternate/non-effect allele, and p-value for all variants within the range of interest
#' @param lead_snp A character vector containing a lead variant of interest. When NULL (default), the variant with the lowest p-value will be selected as the lead variant.
#' @param rsid Rsid column
#' @param chrom Chromosome column
#' @param pos Position column
#' @param ref Reference/effect allele column
#' @param alt Alternate/non-effect allele column
#' @param effect Effect size column (on beta or log-odds scale)
#' @param std_err Standard error column
#' @param p_value P-value column
#' @param plot_pvalue_threshold Threshold for plotting p-value on regional association plot (default = 0.1) - reducing the number of points decreases file size and improves performance
#' @param plot_subsample_prop Proportion of points above p-value threshold to plot (default = 0.25; range = 0-1) - reducing the number of points decreases file size and improves performance
#' @param plot_distance Integer corresponding to the size of the locus that should be plotted
#' @param genome_build Character - one of "GRCh37" or "GRCh38"
#' @param population Character - one of "ALL", "AFR", "AMR", "EAS", "EUR", "SAS" referring to the reference population of interest for obtaining linkage disequilibrium information (default = "ALL")
#' @param plot_genes Logical - Include a plot of genes/transcripts within the region of interest beneath the regional association plot (default = FALSE)
#' @param plot_recombination Logical - Include a secondary y-axis of recombination rate within the region of interest
#' @param plot_title A character string corresponding to plot title (default = NULL)
#' @param plot_subtitle A character string corresponding to plot subtitle (default = NULL)
#' @param path Character string (default = NULL) - if a path is supplied a .pdf of the plot will be saved
#' @param trait (optional) Column containing the name of the trait
#'
#' @return A ggplot object containing a regional association plot for the locus of interest
#' @export
#'
#' @examples
#' \dontrun{
#' # Basic regional association plot
#' gg_locusplot(df = fto_locus_df, lead_snp = "rs62033413", rsid = rsid, chrom = chromosome, pos = position, ref = effect_allele, alt = other_allele, p_value = p_value)
#'
#' # Use "plot_genes = TRUE" to add a plot of genes within the region beneath the regional association plot
#' gg_locusplot(df = fto_locus_df, lead_snp = "rs62033413", rsid = rsid, chrom = chromosome, pos = position, ref = effect_allele, alt = other_allele, p_value = p_value, plot_genes = TRUE)
#' }
#'
gg_locusplot <- function(df, lead_snp = NULL, rsid = rsid, chrom = chrom, pos = pos, ref = ref, alt = alt, effect = NULL, std_err = NULL, p_value = p_value, trait = NULL, plot_pvalue_threshold = 0.1, plot_subsample_prop = 0.25, plot_distance = 500000, genome_build = "GRCh37", population = "ALL", plot_genes = FALSE, plot_recombination = FALSE, plot_title = NULL, plot_subtitle = NULL, path = NULL) {
  # Check input arguments to ensure they are of the correct type and within reasonable ranges
  checkmate::assert_data_frame(df)
  # checkmate::assert_string(lead_snp)
  checkmate::assert_numeric(plot_pvalue_threshold, upper = 1)
  checkmate::assert_numeric(plot_subsample_prop, lower = 0, upper = 1)
  checkmate::assert_numeric(plot_distance, lower = 0)
  checkmate::assert_logical(plot_genes)

  # trait <- rlang::enquo(trait)

  if(!rlang::quo_is_null(rlang::enquo(effect)) & !rlang::quo_is_null(rlang::enquo(std_err))) {
    checkmate::assert_numeric(df %>% pull({{ effect }}))
    checkmate::assert_numeric(df %>% pull({{ std_err }}))

    df <- df %>%
      rename(.effect = {{ effect }},
             .std_err = {{ std_err }}) %>%
      mutate(log10_pval = abs((pnorm(-abs(.effect/.std_err), log.p=TRUE) + log(2)) / log(10)))
  } else {
    df <- df %>%
      mutate(log10_pval = -log10({{ p_value }}))
  }

  if (rlang::quo_is_null(rlang::enquo(trait))) {
    df <- df %>%
      select(rsid = {{ rsid }}, chromosome = {{ chrom }}, position = {{ pos }}, ref = {{ ref }}, alt = {{ alt }}, log10_pval) %>%
      mutate_if(is.factor, as.character) %>%
      mutate(ref = stringr::str_to_upper(ref), alt = stringr::str_to_upper(alt)) %>%
      group_by(rsid) %>%
      slice_max(log10_pval) %>%
      ungroup() %>%
      tidyr::drop_na()
  } else {
    df <- df %>%
      select(rsid = {{ rsid }}, chromosome = {{ chrom }}, position = {{ pos }}, ref = {{ ref }}, alt = {{ alt }}, log10_pval, trait = {{ trait }}) %>%
      mutate_if(is.factor, as.character) %>%
      mutate(ref = stringr::str_to_upper(ref), alt = stringr::str_to_upper(alt)) %>%
      group_by(trait, rsid) %>%
      slice_max(log10_pval) %>%
      ungroup() %>%
      tidyr::drop_na()
  }


  # Create df containing information about lead SNP (by default, select SNP with lowest p-value, otherwise take user-supplied value)
  if (is.null(lead_snp)) {
    indep_snps <- df %>%
      slice_max(log10_pval, with_ties = FALSE, n = 1) %>%
      select(lead_rsid = rsid, lead_chromosome = chromosome, lead_position = position, lead_ref = ref, lead_alt = alt)

    cli::cli_alert_info("No lead_snp supplied. Defaulting to {indep_snps$lead_rsid} - {indep_snps$lead_chromosome}:{indep_snps$lead_position}, which has the lowest p-value in the region")
  } else if (!(lead_snp %in% df$rsid)) {
    # ensure lead_snp is in the supplied data; if not, use minimum p-value at locus
    indep_snps <- df %>%
      slice_max(log10_pval, with_ties = FALSE, n = 1) %>%
      select(lead_rsid = rsid, lead_chromosome = chromosome, lead_position = position, lead_ref = ref, lead_alt = alt)

    cli::cli_alert_info("Lead snp not present in supplied locus data. Defaulting to {indep_snps$lead_rsid} - {indep_snps$lead_chromosome}:{indep_snps$lead_position}, which has the lowest p-value in the region")
  } else {
    # Ensure lead_snp supplied by user is a string
    # checkmate::assert_string(lead_snp)

    indep_snps <- df %>%
      select(lead_rsid = rsid, lead_chromosome = chromosome, lead_position = position, lead_ref = ref, lead_alt = alt) %>%
      filter(lead_rsid == lead_snp) %>%
      distinct(lead_rsid, .keep_all = TRUE)
  }

  # Create dataframe of variants within the region size specified by the user
  suppressMessages(locus_snps <- df %>%
                     filter(rsid %in% indep_snps$lead_rsid) %>%
                     select(chromosome, position, lead_rsid = rsid) %>%
                     purrr::pmap_dfr(function(chromosome_filter = first, position_filter = second, lead_rsid = third) {
                       df %>%
                         filter(chromosome == chromosome_filter & between(position, position_filter - plot_distance / 2, position_filter + plot_distance / 2)) %>%
                         mutate(lead_rsid = lead_rsid) %>%
                         left_join(indep_snps)
                     }))

  # Extract LD and format colors
  possibly_ld_extract_locuszoom <- purrr::possibly(locusplotr::ld_extract_locuszoom, otherwise = NULL)

  ld_extracted <- possibly_ld_extract_locuszoom(chrom = indep_snps$lead_chromosome, pos = indep_snps$lead_position, ref = indep_snps$lead_ref, alt = indep_snps$lead_alt, start = min(locus_snps$position), stop = max(locus_snps$position), genome_build = genome_build, population = population)

  # Create dataframe with variants at locus, LD information, color codes, and labels in preparation for plotting
  if (!(is.null(ld_extracted))) {
    # Join GWAS locus df with LD information
    locus_snps_ld <- ld_extracted %>%
      select(chromosome = chromosome2, position = position2, variant2, correlation) %>%
      mutate(chromosome = as.numeric(chromosome), position = as.numeric(position)) %>%
      tidyr::separate(variant2, into = c("chr_pos", "ref_alt"), sep = "_") %>%
      tidyr::separate(ref_alt, into = c("ref", "alt"), sep = "/") %>%
      right_join(locus_snps, by = c("chromosome" = "chromosome", "position" = "position"), relationship = "many-to-many") %>%
      filter((ref.x == ref.y & alt.x == alt.y) | (ref.x == alt.y & alt.x == ref.y)) %>%
      select(-ends_with(".y"), -chr_pos) %>%
      rename_with(~ stringr::str_replace(.x, ".x", ""), .cols = ends_with(".x"))

    # Create color codes and labels
    locus_snps_ld <- locus_snps_ld %>%
      mutate(color_code = as.character(cut(as.numeric(correlation), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1), labels = c("blue4", "skyblue", "darkgreen", "orange", "red"), include.lowest = TRUE))) %>%
      mutate(legend_label = as.character(cut(as.numeric(correlation), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1), labels = c("0 - 0.2", "0.2 - 0.4", "0.4 - 0.6", "0.6 - 0.8", "0.8 - 1"), include.lowest = TRUE))) %>%
      mutate(lead = rsid == lead_rsid) %>%
      mutate(label = case_when(
        rsid == lead_rsid ~ lead_rsid,
        TRUE ~ NA_character_
      )) %>%
      mutate(color_code = case_when(
        rsid == lead_rsid ~ "purple",
        TRUE ~ color_code
      )) %>%
      mutate(color_code = forcats::fct_expand(color_code, "purple", "red", "orange", "darkgreen", "skyblue", "blue4")) %>%
      mutate(color_code = forcats::fct_relevel(color_code, "purple", "red", "orange", "darkgreen", "skyblue", "blue4")) %>%
      mutate(legend_label = case_when(
        rsid == lead_rsid ~ "Ref",
        TRUE ~ legend_label
      )) %>%
      mutate(legend_label = forcats::fct_expand(legend_label, "Ref", "0.8 - 1", "0.6 - 0.8", "0.4 - 0.6", "0.2 - 0.4", "0 - 0.2")) %>%
      mutate(legend_label = forcats::fct_relevel(legend_label, "Ref", "0.8 - 1", "0.6 - 0.8", "0.4 - 0.6", "0.2 - 0.4", "0 - 0.2"))
  } else {
    # Deal with scenario where lead variant is not present in LD database
    cli::cli_alert_info("No linkage disequilibrium information found")
    locus_snps_ld <- locus_snps %>%
      mutate(correlation = NA_integer_) %>%
      mutate(lead = rsid == lead_rsid) %>%
      mutate(label = case_when(
        rsid == lead_rsid ~ lead_rsid,
        TRUE ~ NA_character_
      )) %>%
      mutate(color_code = case_when(
        rsid == lead_rsid ~ "purple",
        TRUE ~ "grey50"
      )) %>%
      mutate(legend_label = case_when(
        rsid == lead_rsid ~ "Ref",
        TRUE ~ "Other"
      ))
  }

  # group locus by trait if necessary
  if (!rlang::quo_is_null(rlang::enquo(trait))) {
    locus_snps_ld <- locus_snps_ld %>%
      group_by(.data = ., trait)
    }

  locus_snps_ld_label <- locus_snps_ld %>% filter(!is.na(label))

  # Make plot (sample non-significant p-values to reduce overplotting)
  regional_assoc_plot <- locus_snps_ld %>%
                     distinct(rsid, .keep_all = TRUE) %>%
                     filter(log10_pval > -log10(plot_pvalue_threshold) | correlation > 0.2 | legend_label == "Ref") %>% # improve overplotting
                     bind_rows(locus_snps_ld %>%
                                 filter(log10_pval <= -log10(plot_pvalue_threshold) & correlation < 0.2 & legend_label != "Ref") %>%
                                 slice_sample(prop = plot_subsample_prop)) %>%
                     arrange(color_code, log10_pval) %>%
                     ggplot(aes(position, log10_pval)) +
                     geom_point(aes(fill = factor(color_code), size = lead, alpha = lead, shape = lead)) +
                     ggrepel::geom_label_repel(data = locus_snps_ld_label, aes(label = label),
                                               size = 4,
                                               color = "black",
                                               fontface = "bold",
                                               fill = "white",
                                               min.segment.length = 0,
                                               box.padding = 1,
                                               alpha = 1,
                                               nudge_y = 4
                     ) +
                     geom_hline(yintercept = -log10(5e-8), linetype = "dashed") +
                     scale_fill_identity(parse(text = "r^2"), guide = "legend", labels = levels(forcats::fct_drop(locus_snps_ld$legend_label)), na.translate = FALSE) +
                     scale_size_manual(values = c(3, 5), guide = "none") +
                     scale_shape_manual(values = c(21, 23), guide = "none") +
                     scale_alpha_manual(values = c(0.8, 1), guide = "none") +
                     scale_x_continuous(breaks = scales::extended_breaks(n = 5), labels = scales::label_number(scale = 1 / 1e6)) +
                     scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
                     guides(fill = guide_legend(override.aes = list(shape = 22, size = 6),
                                                position = "inside")) +
                     labs(
                       title = plot_title,
                       subtitle = plot_subtitle,
                       x = glue::glue("Position on Chromosome {unique(indep_snps$lead_chromosome)} (Mb)"),
                       y = "-log<sub>10</sub>(P-value)"
                     ) +
                     theme_bw(base_size = 16) +
                     theme(
                       plot.title = element_text(face = "bold"),
                       legend.text = element_text(size = 10),
                       legend.title = element_text(size = 10, hjust = 0.5),
                       legend.justification.inside = c("right", "top"),
                       legend.position.inside = c(0.99, 0.99),
                       strip.text = element_text(color = "black"),
                       strip.text.x = element_blank(),
                       axis.title.y = ggtext::element_markdown(),
                       legend.spacing.y = unit(0, "pt")
                     )

  if (!rlang::quo_is_null(enquo(trait))) {
    regional_assoc_plot <- regional_assoc_plot +
      facet_grid(rows = vars(trait), scales = "free_y")
  }

  if (plot_recombination) {
    cli::cli_alert_info("Extracting recombination rates for the region {indep_snps$lead_chromosome}:{indep_snps$lead_position - plot_distance/2}-{indep_snps$lead_position + plot_distance/2}")
    ylim <- max(pull(locus_snps_ld, log10_pval), na.rm = TRUE) +
      0.3 * max(pull(locus_snps_ld, log10_pval), na.rm = TRUE)

    recomb_df <- recomb_extract_locuszoom(chrom = indep_snps$lead_chromosome, start = indep_snps$lead_position - plot_distance / 2, end = indep_snps$lead_position + plot_distance / 2, genome_build = genome_build) %>%
      select(position, recomb_rate)
    # return(recomb_df)
    suppressMessages(
      regional_assoc_plot <- regional_assoc_plot +
      geom_line(data = recomb_df, mapping = aes(x = position, y = recomb_rate), color = "lightblue", linewidth = 0.5) +
      scale_y_continuous(
        name = "-log<sub>10</sub>(P-value)",
        limits = c(0, ylim),
        sec.axis = sec_axis(
          ~. * (100 / ylim),
          name = "Recombination rate (cM/Mb)"
          # labels = scales::percent_format()
        )
      ) +
        theme(axis.title.y.right = element_text(vjust = 1.5))
      )

    regional_assoc_plot <- gginnards::move_layers(regional_assoc_plot, "GeomLine", "bottom")

  }

  # Add plot of genes if requested by user
  if (plot_genes) {
    cli::cli_alert_info("Extracting genes for the region {indep_snps$lead_chromosome}:{indep_snps$lead_position - plot_distance/2}-{indep_snps$lead_position + plot_distance/2}")
    geneplot <- gg_geneplot(chr = indep_snps$lead_chromosome, start = indep_snps$lead_position - plot_distance / 2, end = indep_snps$lead_position + plot_distance / 2, genome_build = genome_build) +
      theme(plot.margin = margin(0, 5.5, 5.5, 5.5))

    suppressWarnings(suppressMessages(regional_assoc_plot <- patchwork::wrap_plots(list(
      regional_assoc_plot +
        labs(x = "") +
        xlim(indep_snps$lead_position - plot_distance / 2, indep_snps$lead_position + plot_distance / 2) +
        theme(
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.title.x = element_blank(),
          plot.margin = margin(5.5, 5.5, 0, 5.5)
        ),
      geneplot
    ), nrow = 2, heights = c(3, 1))))
  }

  # Return +/- save ggplot object
  if (!is.null(path)) {
    ggsave(regional_assoc_plot, filename = paste0(path, stringr::str_replace_all(unique(indep_snps$lead_rsid), "[^[:alnum:]]", "_"), ".pdf"), units = "in", height = 8.5, width = 11, device = "pdf")
  }
  # } else {
  #   ggsave(regional_assoc_plot, filename = paste0(path, stringr::str_replace_all(unique(indep_snps$lead_rsid), "[^[:alnum:]]", "_"), ".pdf"), units = "in", height = 8.5, width = 11, device = "pdf")
  #   return(regional_assoc_plot)
  # }

  return(regional_assoc_plot)
}
mglev1n/locusplotr documentation built on July 31, 2024, 8:41 p.m.