R/methplot.r

#' @docType package
#' @name methplot
#' @title Exploratory plots of genomic methylation data.
#' @import dplyr
NULL
#' @import data.table
NULL
#' @import ggplot2
NULL
#' @import doMC
NULL

.onAttach <- function(libname, pkgname){
  packageStartupMessage("Welcome to methplot!")
}

# Small helper functions -----------------------
#' Return subset of autosomes.
#'
#' @family utility functions
#' @param this_table A character.
#' @return data.frame
autosomes <- function(this_table){
  autosomes <- paste("chr", 1:22, sep="") %>% c(., 1:22)
  return(this_table[which(as.character(data.frame(this_table)[,"chr"]) %in% autosomes), ])
}
#' Return subset of autosomes plus X and Y.
#'
#' @family utility functions
#' @param this_table A character.
#' @return data.frame
normal_chroms <- function(this_table){
  auto_x_y <- paste("chr", 1:22, sep="") %>% c(., 1:22) %>% c(., paste("chr", c("X", "Y"), sep="")) %>% c(., "X", "Y")
  return(this_table[which(as.character(data.frame(this_table)[,"chr"]) %in% auto_x_y), ])
}
#' Import and format grch37 gene model file.
#'
#' @family utility functions
#' @param grch37_table_path A character.
#' @return data.frame
get_grch_gene_table <- function(grch37_table_path){
  grch37_table <- readr::read_delim(grch37_table_path, delim="\t", col_names=F)
  names(grch37_table) <- c("id", "source", "type", "start", "end", "dot", "strand", "dot_2", "info")
  chrs <- stringr::str_match(grch37_table$id, "NC_0+(.+)\\.")[,2]
  chrs[which(chrs=="12920")] <- "M"
  chrs[which(chrs=="23")] <- "X"
  chrs[which(chrs=="24")] <- "Y"
  chrs <- paste("chr", chrs, sep="")
  grch37_table <- grch37_table %>%
                  dplyr::mutate(chr = chrs) %>%
                  dplyr::filter(type=="gene") %>%
                  dplyr::mutate(gene_name = str_match(.$info, "Name=(.+?);")[,2])

  out <- grch37_table[grep("pseudo=false", grch37_table$info, invert=TRUE), ]
  return(out)
}
#' Import and format UCSC refgene gene model file.
#'
#' @family utility functions
#' @param refgene_path A character.
#' @return data.frame
get_ucsc_refgene_table <-function(refgene_path){
  data.table::fread(refgene_path, sep="\t") %>%
  dplyr::rename(chr = chrom,
         start = txStart,
         end = txEnd,
         gene_name = name2) %>%
         dplyr::mutate(group = basename(refgene_path)) %>%
         return
}
#' Import and format methyl sequencing file.
#'
#' @family utility functions
#' @param meth_path A character.
#' @return data.table
#' @export
get_meth_table <- function(meth_path){
  this_meth_table <- data.table::fread(meth_path, sep="\t") %>%
                       dplyr::rename(chr=V1, start=V2, end=V3, perc_meth=V4, meth=V5, unmeth=V6)
}
#' Get longest possible gene model from refgene.
#'
#' @family utility functions
#' @param ref_table A data.frame or data.table.
#' @return data.table or data.frame
longest_refgene_transcripts <- function(ref_table){
  ref_table %>%
    # some genes are duplicated, so multiple groups must be used:
    dplyr::group_by(gene_name, chr, strand) %>%
    dplyr::summarise(tss = min(start),
              tes = max(end)) %>%
    # autosomes %>%
    normal_chroms %>%
    # .[which(!duplicated(.$gene_name)), ] %>%
    return
}

# Workhorse functions -----------------------------

#' Get ranges of bins from BED element centers.
#'
#' @family workorse functions
#' @param bed_table_path A character.
#' @param bin_width A numeric.
#' @param range A numeric.
#' @return data.table
#' @export
get_bin_ranges_bed <- function(bed_table_path, bin_width, range){
  path_basename <- basename(bed_table_path)
  bed_table <- data.table::fread(bed_table_path, sep="\t")
  bed_table <- bed_table %>% dplyr::rename(chr = V1, start = V2, end = V3)
  bed_table$midpoints <- with(bed_table, round((start + end)/2))
  bin_start <- seq(from = -range, to = (range - bin_width), by=bin_width)
  expanded_bed_table <- bed_table[rep(seq_len(nrow(bed_table)), each=length(bin_start)),]
  bin_start_rep <- rep(bin_start, nrow(bed_table))
  expanded_bed_table$bin_start <- bin_start_rep
  expanded_bed_table$bin_end  <- expanded_bed_table$bin_start + bin_width
  expanded_bed_table %>%
    dplyr::transmute(chr   = chr,
              start = midpoints + bin_start,
              end   = midpoints   + bin_end,
              bin_start = bin_start,
              bin_end   = bin_end,
              group = path_basename) %>%
    return
}
#' Get bin ranges for gene TSSs
#'
#' @family workorse functions
#' @param refgene_path A character.
#' @param range A numeric.
#' @param bin_width A numeric.
#' @return data.table
#' @export
get_bin_ranges_refgene <- function(refgene_path, range, bin_width){
  refgene_table <- get_ucsc_refgene_table(refgene_path)
  collapsed_refgene <- longest_refgene_transcripts(refgene_table)
  bin_start <- seq(from=-range, to=(range - bin_width), by=bin_width)
  bin_end  <- bin_start + bin_width- 1

  # expanded_refgene <- collapsed_refgene[rep(seq_len(nrow(collapsed_refgene)), each=length(bin_start)), ]

  rep_indices <- nrow(collapsed_refgene) %>%
                      seq_len %>%
                      rep(., each=length(bin_start))
  expanded_refgene <- collapsed_refgene %>% dplyr::slice(rep_indices)

  expanded_refgene$bin_start <- rep(bin_start, nrow(collapsed_refgene))
  expanded_refgene$bin_end <- expanded_refgene$bin_start + bin_width

  # Initialize bin boundaries
  expanded_refgene$start <- 0
  expanded_refgene$end   <- 0

  # Add plus bin boundaries
  plus_rows  <- which(expanded_refgene$strand == "+")
  expanded_refgene[plus_rows, ]$start <- expanded_refgene$tss[plus_rows] + expanded_refgene$bin_start[plus_rows]
  expanded_refgene[plus_rows, ]$end   <- expanded_refgene$tss[plus_rows] + expanded_refgene$bin_end[plus_rows]

  # Add minus row bin boundaries -- expecting the column annotated "tes" to be
  #   the biological TSS for genes on the "-" strand (i.e. annotated TSS
  #   position will always be < annotated TES position)
  minus_rows <- which(expanded_refgene$strand == "-")
  expanded_refgene[minus_rows, ]$start <- expanded_refgene$tes[minus_rows] - expanded_refgene$bin_end[minus_rows]
  expanded_refgene[minus_rows, ]$end   <- expanded_refgene$tes[minus_rows] - expanded_refgene$bin_start[minus_rows]

  expanded_refgene$group <- refgene_table$group[1]
  return(expanded_refgene %>% select(gene_name, chr, strand, start, end, bin_start, bin_end, group))
}
#' Get range overlaps
#'
#' @family workorse functions
#' @param bin_ranges A data.frame.
#' @param bed_path A character.
#' @return data.table
get_genome_range_overlaps <- function(bin_ranges, bed_path){
  bin_ranges <- data.table(bin_ranges)
  setkey(bin_ranges, chr, start, end)
  bed <- data.table::fread(bed_path)
  names(bed)[1:3] <- c("chr", "start", "end")
  capture_overlaps <- data.table::foverlaps(bed, bin_ranges, nomatch=0L)
  return(capture_overlaps)
}
#' Get percent methylation table by bin.
#'
#' @family workorse functions
#' @param bin_ranges A data.frame.
#' @param meth_table A data.table.
#' @return data.table
#' @export
get_binned_perc_meth <- function(ranges, meth_table, abs = FALSE){
  # Use absolute distance from the element centers (not strand-specific)
  if(abs == TRUE){
    abs_neg_bin_start <- abs(ranges[which((abs(ranges$bin_start) > abs(ranges$bin_end))), ]$bin_start)
    abs_neg_bin_end   <- abs(ranges[which((abs(ranges$bin_start) > abs(ranges$bin_end))), ]$bin_end)
    negative_indices <- which((abs(ranges$bin_start) > abs(ranges$bin_end)))
    ranges[negative_indices, ]$bin_start <- abs_neg_bin_end
    ranges[negative_indices, ]$bin_end   <- abs_neg_bin_start
  }

  ranges <- data.table(ranges)
  data.table::setkey(ranges, chr, start, end)
  capture_overlaps <- data.table::foverlaps(meth_table, ranges, type="any", nomatch=0L)
  output <- capture_overlaps %>%
    group_by(bin_start, bin_end) %>%
    dplyr::group_by(bin_start, bin_end) %>%
    dplyr::summarise(meth = sum(meth), unmeth = sum(unmeth), cpg_count = length(meth), group = group[1]) %>% 
    dplyr::mutate(perc_meth = meth / (meth + unmeth), depth = (meth + unmeth) / cpg_count) %>%
    dplyr::arrange(bin_start) %>%
    return
}
#' Get percent methylation table by bin for list of meth tables.
#'
#' @family workorse functions
#' @param bin_ranges A data.frame.
#' @param meth_table_list A list of data.tables.
#' @param cores An integer numeric.
#' @return list
#' @export
get_binned_perc_meth_list <- function(bin_ranges, meth_table_list, abs = FALSE, cores = 1){
  dt_names <- names(meth_table_list)
  binned_perc_meth_list <- mclapply(dt_names, function(dt_name){
    this_binned_perc_meth <- get_binned_perc_meth(bin_ranges, meth_table_list[[dt_name]], abs = abs)
    this_binned_perc_meth$group <- dt_name
    gc()
    return(this_binned_perc_meth)
  }, mc.cores = cores)
  names(binned_perc_meth_list) <- dt_names
  return(binned_perc_meth_list)
}
#' Take BED-formatted single base coords from arbitrary number of supplied tables and return base intersection 
#'
#' @family workorse functions
#' @param ... A list of meth tables (data.tables).
#' @return A list of data.tables.
#' @export
get_intersect_single_bases <- function(...){
  meth_tables_list <- as.list(...)
  names <- deparse((substitute(...)))
  table_names <- names %>% str_replace("list\\(", "") %>% str_replace("\\)", "") %>% strsplit(", ") %>% unlist
  intersect_cols <- Reduce(function(x, y) merge(x, y, by = c("chr", "start", "end")), meth_tables_list) %>% select(chr, start, end)
  intersect_tables_list <- lapply(meth_tables_list, function(x){
    merge(intersect_cols, x)
  })
  names(intersect_tables_list) <- table_names
  return(intersect_tables_list)
}
#' Quickly get the overlaps between two bed-like tables via data splitting and parallel processing
#'
#' @family workhorse functions
#' @param table_1 A data.table.
#' @param table_2 A data.table.
#' @param cores An integer numeric. 
#' @return data.table
parallel_genomic_intersect <- function(table_1, table_2, cores = 1){
  doMC::registerDoMC(cores = cores)
  table_1_chrs <- table_1$chr %>% 
    as.factor %>% 
    levels
  table_2_chrs <- table_2$chr %>% 
    as.factor %>% 
    levels

  intersect_chrs <- table_1_chrs[table_1_chrs %in% table_2_chrs]

  capture <- plyr::adply(intersect_chrs, 1, function(x){
    table_1_sub <- table_1 %>% dplyr::filter(chr == x)
    table_2_sub <- table_2 %>% dplyr::filter(chr == x)
    data.table::setkey(table_1_sub, chr, start, end)
    return(data.table::foverlaps(table_2_sub, table_1_sub, type="any", nomatch=0L))
  }, .parallel = TRUE, .id = NULL)
  
  doMC::registerDoMC(cores = 1)
  return(data.table(capture))
}
#' Get the rows from table_1 with coords that don't intersect the coords in table_2
#'
#' @family workorse functions
#' @param table_1 A data.table in BED format.
#' @param table_2 A data.table in BED format.
#' @return A data.table.
genomic_complement <- function(table_1, table_2){
  data.table::setkey(table_2, chr, start, end)
  intersect_indices <- foverlaps(table_1, table_2, which = TRUE, type="any", nomatch = 0L)
  all_indices <- 1:nrow(table_1)
  return(table_1[!(all_indices %in% intersect_indices$xid), ])
}
#' Get the rows from table_1 with coords that don't intersect the coords in table_2 parallelized by chrom
#'
#' @family workorse functions
#' @param table_1 A data.table in BED format.
#' @param table_2 A data.table in BED format.
#' @param cores An integer numeric.
#' @return data.table
parallel_genomic_complement <- function(table_1, table_2, cores = 1){
  doMC::registerDoMC(cores = cores)
  table_1_chrs <- table_1$chr %>% 
    as.factor %>% 
    levels
  table_2_chrs <- table_2$chr %>% 
    as.factor %>% 
    levels

  intersect_chrs <- table_1_chrs[table_1_chrs %in% table_2_chrs]

  capture <- plyr::adply(intersect_chrs, 1, function(x){
    table_1_sub <- table_1 %>% dplyr::filter(chr == x)
    table_2_sub <- table_2 %>% dplyr::filter(chr == x)
    # data.table::setkey(table_1_sub, chr, start, end)
    data.table::setkey(table_2_sub, chr, start, end)
    intersect_indices <- data.table::foverlaps(table_1_sub, table_2_sub, which = TRUE, type="any", nomatch=0L)
    all_indices <- 1:nrow(table_1_sub)
    return(table_1_sub[!(all_indices %in% intersect_indices$xid), ])
  }, .parallel = TRUE, .id = NULL) 

  doMC::registerDoMC(cores = 1)
  return(data.table(capture))
}
#' Get overlap enrichment in ranges for generic BED-formatted file (e.g. peak ranges)
#'
#' @family workorse functions
#' @param ranges A data.frame or data table.
#' @param bed_path A character.
#' @return data.table
#' @export
get_aggregate_bed_enrichment_over_ranges <- function(ranges, bed_path){
  overlap <- get_genome_range_overlaps(ranges, bed_path)
  overlap$overlap_start  <- pmax(overlap$start, overlap$i.start)
  overlap$overlap_end    <- pmin(overlap$end, overlap$i.end)
  overlap$overlap_length <- overlap$overlap_end - overlap$overlap_start
  overlap_collapsed      <- overlap %>%
    dplyr::group_by(bin_start, bin_end) %>%
    dplyr::summarise(aggregate_bases_overlap = sum(overlap_length)) %>%
    dplyr::arrange(bin_start)

  overlap_collapsed$group <- "chip_enrichment"
  overlap_collapsed$meth <- NA
  overlap_collapsed$unmeth <- NA
  overlap_collapsed$cpg_count <- NA
  overlap_collapsed$depth <- NA
  overlap_collapsed$perc_meth <- NA
  overlap_collapsed <- overlap_collapsed %>% dplyr::transmute(bin_start, bin_end, meth, unmeth, cpg_count, group, perc_meth, depth, aggregate_bases_overlap)

  return(overlap_collapsed)
}
#' Concatenate binned_perc_meth table with aggreegate enrichment table (e.g. from ChIP peaks), addiding dummy columns as required.
#'
#' @family workorse functions
#' @param binned_perc_meth_table A data.table.
#' @param chip_enrich_table A data.table.
#' @return data.table
#' @export
add_chip_enrich_to_binned_perc_meth <- function(binned_perc_meth_table, chip_enrich_table){
  binned_perc_meth_table$aggregate_bases_overlap <- NA
  bound <- rbind(binned_perc_meth_table, chip_enrich_table)
  return(bound)
}

# Higher level functions --------------------------

#' Get percent methylation table around gene model TSSs.
#'
#' @family high level functions
#' @param refgene_path A character.
#' @param meth_table A data.frame.
#' @param range A numeric.
#' @param bin_width A numeric.
#' @return data.table
#' @export
get_tss_perc_meth <- function(refgene_path, meth_table, range, bin_width){
  if(range < bin_width) stop("Stopped: Range is less than bin_width!")
  bin_ranges_refgene <- get_bin_ranges_refgene(refgene_path, range = range, bin_width = bin_width)
  get_binned_perc_meth(bin_ranges_refgene, meth_table) %>% return
}
#' Mask CpGs in repeats
#'
#' @family high level functions
#' @param repeat_mask_ucsc_path A character.
#' @param meth_table A data.table.
#' @return data.table
#' @export
repeat_mask_meth_table <- function(repeat_mask_ucsc_path, meth_table){
  repeat_mask <- fread(repeat_mask_ucsc_path)
  repeat_mask <- repeat_mask %>% dplyr::select(genoName, genoStart, genoEnd)
  names(repeat_mask) <- c("chr", "start", "end")
  return(genomic_complement(meth_table, repeat_mask))
}

# Plotting functions ------------------------------

#' Plot binned aggregate enrichments arround genomic-range-defined (BED-defined) centers.
#'
#' @family plotting functions
#' @param ranges A data.frame or data table.
#' @param bed_path A character.
#' @return ggplot
#' @export
plot_generic_aggregate_enrichment <- function(ranges, bed_path){
  enrichment_table <- get_aggregate_bed_enrichment_over_ranges(ranges, bed_path)
  enrichment_table <- enrichment_table %>% dplyr::mutate(bin_mid = ceiling((bin_end + bin_start)/2))
  enrichment_table %>% head %>% print
  this_plot <- ggplot2::ggplot(enrichment_table, aes(bin_mid, aggregate_bases_overlap)) + ggplot2::geom_area()
  return(this_plot)
}
#' Plot binned percent meth table.
#'
#' @family plotting functions
#' @param binned_perc_meth_table A data.frame or data table.
#' @return ggplot
#' @export
plot_percent_meth <- function(binned_perc_meth_table){
  binned_perc_meth_table$bin_mid <- with(binned_perc_meth_table, (bin_start + bin_end)/2)
  this_plot <- ggplot2::ggplot(binned_perc_meth_table, aes(bin_mid, perc_meth)) +
    ggplot2::geom_line(aes(color=group)) +
    ggplot2::ylab("Percent Methylation") +
    ggplot2::xlab("Distance from Center (bp)") +
    ggplot2::scale_x_continuous(expand=c(0,0)) +
    ggplot2::scale_y_continuous(expand=c(0,0.0001), limits = c(0,1)) +
    ggplot2::theme(panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          # legend.position="none",
          legend.key = element_blank(),
          axis.line = element_line(),
          panel.background = element_blank(),
          axis.text.x = element_text(color="black", size=12),
          axis.title.x = element_text(color="black", size=14, vjust=-1.5),
          axis.text.y = element_text(color="black", size=12),
          axis.title.y = element_text(color="black", size=14, vjust=3),
          plot.margin = grid::unit(c(1,1,1,1), "cm"),
          panel.border=element_blank(),
          axis.ticks=element_line(size=0.6, color="black"))

  this_plot %>% return
}
#' Plot binned percent meth table with a depth (as number of CpGs assayed) facet.
#'
#' @family plotting functions
#' @param binned_perc_meth_table A data.frame or data table.
#' @return ggplot
#' @export
plot_percent_meth_with_depth <- function(binned_perc_meth_table){
  binned_perc_meth_table$bin_mid <- with(binned_perc_meth_table,
                                         (bin_start + bin_end)/2)
  merged_df <- melt(binned_perc_meth_table, id.vars = c("bin_start", "bin_end", "meth", "unmeth", "group", "bin_mid"))
  this_plot <- ggplot2::ggplot(merged_df) +
    ggplot2::geom_line(aes(x = bin_mid, y = value, color = group)) +
    ggplot2::geom_line(aes(x = bin_mid, y = value, color = group)) +
    ggplot2::facet_grid(variable~., scales = "free_y")

  this_plot %>% return
}

# Unfinished functions -------------------------

#' Get gene quantiles for single gene.
#'
#' @family plotting functions
#' @param trancript_line A data.frame or data table .
#' @param quantiles A numeric.
#' @return data.table
get_gene_quantiles  <- function(trancript_line, quantiles = 100){
  tss <- transcript_line$tss
  tes <- transcript_line$tes
  chr <- transcript_line$chr
  strand <- transcript_line$strand
  transcript_line <- longest_transcripts[1,]
  quantile_step <- ((tes - tss)/quantiles) %>%
    round(., digits=0)

  start <- c(tss, tss + cumsum(rep(quantile_step, quantiles-1)))
  end   <- c(start + quantile_step) + 1

  if(strand == "-"){
    start <- rev(start)
    end <- rev(end)
  }

  bin_start <- 0:(length(start)-1)
  bin_end <-   bin_start + 1
  group <- paste(quantiles, "geneBodyQuantiles", sep="")

  quantile_df <- data.frame(chr, start, end, bin_start, bin_end, group)
  quantile_df %>% return
}
tubuliferous/methplot documentation built on June 1, 2019, 2:52 a.m.