ClusterBurden: Rare variant association utilising burden and positional information.

Documented in plot_signif_distribs

#' Plot the distributions of case and control variant residues across multiple genes
#'
#' @author Adam Waring - adam.waring@@msdtc.ox.ac.uk
#' @return Returns a ggplot object
#'
#' @param pvals pvals data.table from ClusterBurden WES analysis with at least symbol and p-value column for test of interest
#' @param cases case data in format: data.table(aff, symbol, protein_position, ac)
#' @param controls control data in format: data.table(aff, symbol, protein_position, ac)
#' @param case_coverage optional coverage data for cases in format: data.table(symbol, protein_position, over_10)
#' @param control_coverage optional coverage data for controls in format: data.table(symbol, protein_position, over_10)
#' @param test either bin-test, burden or clusterburden (not case sensitive)
#' @param cov_threshold threshold at which to exclude a residue position from the analysis (choose 0 to keep all residues)
#' @param n_genes number of genes to plot e.g. n_genes = 5 plots the 5 most significant genes
#' @param SCALE scale of the plot
#'
#' @keywords RVAT, distribution, cluster, genetics, case-control, gene
#'
#' @details For each gene (n_genes=?): produces a stripchart using geom_jitter for both cases and controls as well as a gaussian density line for each cohort. If coverage files are provided then these are included as a rug underneath and above the case and control variants and are
#' colored by their coverage level (< cov_threshold, < 80% 10X, < 90% 10X, > 90% 10X)
#'
#' @export



plot_signif_distribs = function(pvals, cases, controls, case_coverage, control_coverage, test="BIN-test", cov_threshold=0.5, n_genes=20, SCALE=1){

  test = c("bin-test"="BIN.test_pvalue", "burden"="burden_pvalue", "clusterburden"="ClusterBurden")[tolower(test)]

  point_size = 2 * SCALE
  line_size = 1.5 * SCALE
  cov_size = 3 * SCALE
  theme_size = 25 * SCALE


  top_pvals = pvals[order(pvals[[test]])][1:n_genes]
  top_genes = top_pvals$symbol

  dataset = rbind(cases, controls)[symbol%in%top_genes]
  dataset[,y:=aff+1]

  case_prov = attributes(cases)$provenance
  control_prov = attributes(controls)$provenance

  case_auto = !is.null(case_prov) && case_prov == "auto"
  control_auto = !is.null(control_prov) && control_prov == "auto"

  if(case_auto) case_coverage = attributes(cases)$coverage
  if(control_auto) control_coverage = attributes(controls)$coverage


  null1 = !is.null(case_coverage)
  null2 = !is.null(control_coverage)
  coverage = NULL
  if(null1){
    coverage = rbind(coverage, cbind(case_coverage[symbol%in%top_genes], aff=1, y=3))
  }
  if(null2){
    coverage = rbind(coverage, cbind(control_coverage[symbol%in%top_genes], aff=0, y=0))
  }


  dataset[,color := ifelse(aff==2, "Cases X10 > 80%", "Controls X10 > 80%")]
  COLORS = setNames(c("#D8B70A", "#81A88D"),
                    c("Cases X10 > 80%", "Controls X10 > 80%"))

  if(null1|null2){
    blackname = paste0("Blacklisted (X10 < ", cov_threshold, ")")
    coverage[,color := ifelse(over_10 < cov_threshold, blackname, ifelse(over_10 < 0.8, "X10 < 80%", ifelse(over_10 < 0.9, "X10 < 90%", ifelse(aff==1, "Cases X10 > 80%", "Controls X10 > 80%"))))]
    COLORS = c(COLORS, setNames(c("black", "darkred", "red"),
                                c(blackname, "X10 < 80%",  "X10 < 90%")))

    dataset = merge(dataset, coverage[,.(symbol, protein_position, aff, over_10)], by=c("symbol", "protein_position", "aff"), all.x=T)
    dataset = merge(dataset, unique(coverage[color == blackname,.(symbol, protein_position, black=over_10)]), by=c("symbol", "protein_position"), all.x=T)
    dataset[,color := ifelse(!is.na(black), blackname, ifelse(over_10 < 0.8, "X10 < 80%", ifelse(over_10 < 0.9, "X10 < 90%", ifelse(aff==1, "Cases X10 > 80%", "Controls X10 > 80%"))))]
  }


  g1 = ggplot(dataset, aes(y = y, x = protein_position, color=color)) +

    geom_jitter(alpha=0.3, size=point_size) +

    facet_wrap(~symbol, scales = "free_x") +

    stat_density(geom="line", data=dataset[aff==1], aes(y=1.5 + ..scaled..), size=line_size, color="#D8B70A") +
    stat_density(geom="line", data=dataset[aff==0], aes(y=0.5 + ..scaled..), size=line_size, color="#81A88D") +

    scale_color_manual(name="", values=COLORS) +

    labs(x = "Index in linear protein sequence", y = "") +

    theme_bw(theme_size) + theme(legend.position="top")

  if(null1|null2){
    g1 = g1 +  geom_point(data=coverage, aes(color=color), size=cov_size, shape=15) +
      geom_point(data=coverage[color=="X10 < 80%"], aes(color=color), size=cov_size, shape=15) +
      geom_point(data=coverage[color=="X10 < 90%"], aes(color=color), size=cov_size, shape=15) +
      geom_point(data=coverage[color==blackname], aes(color=color), size=cov_size, shape=15)
  }

  if(null1 & null2){

    g1 = g1 + scale_y_continuous(breaks = 0:3, limits = c(-0.25, 3.25), labels = c("Control coverage", "Control variants", "Case variants", "Case coverage"))

  } else if(null2 & !null1){

    g1 = g1 + scale_y_continuous(breaks = 0:2, limits = c(-0.25, 2.5), labels = c("Control coverage", "Control variants", "Case variants"))

  } else if(null1 & !null2){

    g1 = g1 + scale_y_continuous(breaks = 1:3, limits = c(0.5, 3.25), labels = c("Control variants", "Case variants", "Case coverage"))

  } else {

    g1 = g1 + scale_y_continuous(breaks = 1:2, limits = c(0.5, 2.5), labels = c("Control variants", "Case variants"))

  }

  g1
  return(g1)


}

adamwaring/ClusterBurden documentation built on July 29, 2020, 9:50 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

adamwaring/ClusterBurden
Rare variant association utilising burden and positional information.

R/plot_signif_distribs.r
In adamwaring/ClusterBurden: Rare variant association utilising burden and positional information.

Defines functions plot_signif_distribs

Documented in plot_signif_distribs

R Package Documentation

Browse R Packages

We want your feedback!

adamwaring/ClusterBurden Rare variant association utilising burden and positional information.

R/plot_signif_distribs.r In adamwaring/ClusterBurden: Rare variant association utilising burden and positional information.

Defines functions plot_signif_distribs

Documented in plot_signif_distribs

R Package Documentation

Browse R Packages

We want your feedback!

adamwaring/ClusterBurden
Rare variant association utilising burden and positional information.

R/plot_signif_distribs.r
In adamwaring/ClusterBurden: Rare variant association utilising burden and positional information.