R/functions.R

#' Parse a genomic region into parts.
#'
#' @param region A string like "chr1:123-456".
#' @return A list with elements "chrom", "start", and "end".
#' @export
parse_region <- function(region) {
  xs <- strsplit(region, ":")[[1]]
  chrom <- xs[1]
  xs <- as.numeric(strsplit(xs[2], "-")[[1]])
  return(list("chrom" = chrom, "start" = xs[1], "end" = xs[2]))
}

#' Read a bedGraph file with 4 columns: chrom, start, end, score
#'
#' @param filename The file to read. No column names.
#' @param ... Additional parameters passed to \code{\link[readr]{read_tsv}}
#' @return A GRanges object.
#' @export
#' @importFrom readr read_tsv
read_bedGraph <- function(filename, ...) {
  dat <- read_tsv(
    filename, col_names = FALSE, ...)
  colnames(dat) <- c("chrom", "start", "end", "score")
  makeGRangesFromDataFrame(
    df = dat,
    keep.extra.columns = TRUE,
    ignore.strand = TRUE,
    starts.in.df.are.0based = TRUE
  )
}

#' Read a text file output by FIMO and select sites that meet a significance
#' threshold.
#'
#' @param fimo_file A FIMO text file with PWM match sites.
#' @param pvalue Select FIMO matches with p-value less than this.
#' @return A dataframe with one row per site.
#' @details
#'    For sites represented multiple times, the one with the maximum
#'    score is selected.
#' @export
#' @importFrom readr read_tsv
#' @importFrom janitor clean_names 
read_fimo <- function(fimo_file, pvalue = 1e-4, ...) {
  # Read the PWM sites.
  sites <- clean_names(read_tsv(fimo_file, ...))

  # Discard poor matches.
  sites <- sites[sites$p_value < pvalue, , drop = FALSE]

  # For sites represented multiple times, select the one with the max score.
  sites$region <- paste(sites$sequence_name, sites$start, sites$stop)

  # Sort sites according to regions and descending score.
  sites <- sites[with(sites, order(sequence_name, start, stop, -score)), ]

  # Find duplicated regions and remove the ones with lower scores.
  dups <- duplicated(sites$region)
  if(sum(dups) > 0) {
    sites <- sites[-which(dups), ]
  }

  # Drop the region column.
  sites <- sites[, !colnames(sites) %in% c("region")]

  # Ensure we have some matches.
  if (nrow(sites) == 0) {
    stop(sprintf("No significant sites for '%s'", fimo_file))
  }

  return(sites)
}

#' Count readstarts at each nucleotide position in each BED region.
#'
#' @param bam_file A BAM file with mapped DNase-Seq data.
#' @param fimo_file A FIMO text file with PWM match sites.
#' @param pvalue Select FIMO matches with p-value less than this.
#' @return A list with two items "mat" and "regions".
#' @details
#'    The returend matrix "mat" has one row for each region in the BED
#'    file, and one column for each genomic position.
#'    The returned dataframe "regions" described each region with a p-value
#'    and q-value from FIMO.
#' @export
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom Rsamtools scanBam ScanBamParam
centipede_data <- function(
  bam_file, fimo_file, pvalue = 1e-4, flank_size = 100, ...
) {
  # Read the FIMO output file.
  sites <- read_fimo(fimo_file = fimo_file, pvalue = pvalue, ...)

  # Upstream flank, the center of PWM match, and downstream flank.
  sites$start <- sites$start - flank_size
  sites$stop <- sites$stop + flank_size

  # Index the BAM file if necessary.
  bam_index_file <- sprintf("%s.bai", bam_file)
  if (!file.exists(bam_index_file)) {
    message("Indexing the BAM file... this may take several minutes.")
    indexBam(bam_file, overwrite = FALSE)
  }

  # Extract reads that overlap the PWM sites.
  bam_list <- scanBam(
    file = bam_file,
    param = ScanBamParam(
      which = GRanges(
        seqnames = sites$sequence_name,
        ranges = IRanges(
          start = sites$start,
          end = sites$stop
        )
      ),
      what = c("strand", "pos", "qwidth")
    )
  )

  if (length(bam_list) == 0) {
    stop(sprintf("No reads fall in sites from '%s'\n", fimo_file))
  }

  # Convert the list of "chr:start-end" regions to a dataframe.
  regions <- lapply(names(bam_list), parse_region)
  regions <- data.frame(
    sequence_name = unlist(sapply(regions, function(x) x["chrom"])),
    start = as.numeric(unlist(sapply(regions, function(x) x["start"]))),
    stop = as.numeric(unlist(sapply(regions, function(x) x["end"])))
  )
  regions$index <- 1:nrow(regions)

  # Grab score, p_value, and q_value, but drop regions with no reads.
  regions <- merge(regions, sites)

  # Sort the regions by coordinate.
  regions <- regions[with(regions, order(sequence_name, start, stop)),]
  bam_list <- bam_list[regions$index]

  # Drop unused columns.
  regions <- regions[,!colnames(regions) %in% c("index")]

  if (nrow(regions) != length(bam_list)) {
    stop(sprintf("ERROR: %d regions and %d in bam_list. '%s'\n",
                 nrow(regions), length(bam_list), fimo_file))
  }

  mat <- do.call(rbind, lapply(seq(1, length(bam_list)), function(i) {

    # The reads overlapping this region.
    region <- parse_region(names(bam_list)[i])
    region_len <- abs(region$end - region$start) + 1

    # Exclude reads with start positions outside the region.
    item <- bam_list[[i]]

    # take care of negative reads starting at end position
    neg_shift <- item$qwidth * as.numeric(item$strand == "-")
    item$pos <- item$pos + neg_shift

    idx <- item$pos >= region$start & item$pos <= region$end
    if (sum(idx) == 0) {
      return(rep(0, 2 * region_len))
    }
    strand <- item$strand[idx]
    position <- item$pos[idx]

    # Create a row that represents the flanking region surrounding a site,
    # each column is a nucleotide position relative to the center of the motif
    # match.
    # The values in this matrix are the number of read start sites that occur
    # at that position.
    # Since we have two strands, the first half of the matrix represents the
    # positive strand and the second half represents the negative strand.
    is.neg <- as.numeric(strand == "-")
    j <- 1 + position - region$start + (region_len * is.neg)

    as.numeric(table(factor(j, levels = seq(1, 2 * region_len))))
  }))
  rownames(mat) <- names(bam_list)

  list(mat = mat, regions = regions)
}

# TODO: Create better functions for viewing CENTIPEDE results.
#
# imageReadStarts <- function(mat, site.width = 13) {
#   # Try using image() ...
#   image(t(mat), useRaster = TRUE, axes = FALSE)
#   len <- ncol(mat) / 2
#   axis(
#     side = 1,
#     at = c(
#       (len / 2 - site.width) / ncol(mat),
#       (len / 2 + site.width) / ncol(mat)
#     ),
#     labels = c("", "")
#   )
#   axis(
#     side = 1,
#     at = c(
#       (len + (len / 2 - site.width)) / ncol(mat),
#       (len + (len / 2 + site.width)) / ncol(mat)
#     ),
#     labels = c("", "")
#   )
#
#   # Also try using grid.raster() ...
#   img <- 1 - mat / max(mat)
#   w <- convertUnit(unit(ncol(img),"pt"), "in", value=TRUE)
#   h <- convertUnit(unit(nrow(img),"pt"), "in", value=TRUE)
#
#   # dev.new(width=w, height=h)
#
#   grid.raster(
#     image = img,
#     width = unit(1, "npc"),
#     height = unit(1, "npc")
#   )
#
#   v = dataViewport(xData = d$x, yData = d$y)
#   grid.points(d$x,d$y, default.units="native", vp = v,
#               gp=gpar(col="white"), pch=8)
# }
slowkow/CENTIPEDE.tutorial documentation built on May 30, 2019, 3:06 a.m.