inst/doc/Spatial_correlation.R

## ----setup, include = FALSE-----------------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
options(width=100)

## -------------------------------------------------------------------------------------------------
library(data.table)
data(ChIPreads, package="PeakSegDisk", envir=environment())
(experiments <- ChIPreads[, .(
  mean.bases=mean(chromEnd-chromStart),
  median.bases=median(chromEnd-chromStart),
  chromStart=min(chromStart)
), by=list(experiment)])

## -------------------------------------------------------------------------------------------------
end.counts <- ChIPreads[, list(
  count=.N #ignores dup reads, sum(count) would not.
), by=list(experiment, chrom, chromEnd)]
(aligned.dt <- rbind(
  ChIPreads[, .(
    bases.counted="each", experiment, chrom,
    chromStart, chromEnd,
    count=1)], #ignore duplicate reads.
  end.counts[, .(
    bases.counted="end", experiment, chrom,
    chromStart=chromEnd-1L, chromEnd,
    count)]))

## -------------------------------------------------------------------------------------------------
aligned.dt[, {
  as.list(quantile(chromEnd-chromStart))
}, by=.(bases.counted, experiment)]

## -------------------------------------------------------------------------------------------------
(seq.dt <- aligned.dt[, {
  event.dt <- rbind(
    data.table(count, pos=chromStart+1L),
    data.table(count=-count, pos=chromEnd+1L))
  edge.vec <- event.dt[, {
    as.integer(seq(min(pos), max(pos), l=100))
  }]
  event.bins <- rbind(
    event.dt,
    data.table(count=0L, pos=edge.vec))
  total.dt <- event.bins[, .(
    count=sum(count)
  ), by=list(pos)][order(pos)]
  total.dt[, cum := cumsum(count)]
  total.dt[, bin.i := cumsum(pos %in% edge.vec)]
  ## it is somewhat confusing because total.dt pos is the first base
  ## with cum, and cum goes all the way up to but not including the
  ## pos of the next row.
  total.dt[, data.table(
    chromStart=pos[-.N]-1L,
    chromEnd=pos[-1]-1L,
    count=cum[-.N],
    bin.i=bin.i[-.N])]
}, by=list(bases.counted, experiment, chrom)])

## -------------------------------------------------------------------------------------------------
if(require(ggplot2)){
gg.data <- ggplot()+
  theme_bw()+
  theme(panel.spacing=grid::unit(0, "lines"))+
  facet_grid(
    bases.counted ~ experiment,
    scales="free",
    labeller=label_both)+
  geom_step(aes(
    chromStart/1e3, count, color=data.type),
    data=data.table(seq.dt, data.type="exact"))+
  scale_color_manual(values=c(
    exact="black",
    bins="red",
    model="deepskyblue"
  ))+
  scale_x_continuous("Position on hg19 chrom (kb = kilo bases)")
print(gg.data)
}

## -------------------------------------------------------------------------------------------------
bin.dt <- seq.dt[, {
  bases <- chromEnd - chromStart
  data.table(
    binStart=min(chromStart),
    binEnd=max(chromEnd),
    mean.count=sum(count*bases)/sum(bases),
    bases=sum(bases)
  )}, by=list(bases.counted, experiment, bin.i)]
if(require(ggplot2)){
gg.bins <- gg.data+
  geom_step(aes(
    binStart/1e3, mean.count, color=data.type),
    alpha=0.75,
    linewidth=1,
    data=data.table(bin.dt, data.type="bins"))+
  scale_y_log10("Aligned DNA sequence reads (log scale)")
print(gg.bins)
}

## -------------------------------------------------------------------------------------------------
if(interactive() && requireNamespace("future"))future::plan("multisession")
segs.dt <- seq.dt[, {
  data.dir <- file.path(tempdir(), bases.counted, experiment)
  dir.create(data.dir, showWarnings=FALSE, recursive=TRUE)
  coverage.bedGraph <- file.path(data.dir, "coverage.bedGraph")
  fwrite(
    .SD[, .(chrom, chromStart, chromEnd, count)],
    coverage.bedGraph,
    sep="\t",
    quote=FALSE,
    col.names=FALSE)
  fit <- PeakSegDisk::sequentialSearch_dir(data.dir, 2L, verbose=1)
  data.table(fit$segments, data.type="model")
}, by=list(bases.counted, experiment)]
changes.dt <- segs.dt[, {
  .SD[-1]
}, by=list(bases.counted, experiment, data.type)]
if(require(ggplot2)){
gg.model <- gg.bins+
  geom_segment(aes(
    chromStart/1e3, mean,
    xend=chromEnd/1e3, yend=mean,
    color=data.type),
    data=segs.dt)+
  geom_vline(aes(
    xintercept=chromEnd/1e3,
    color=data.type),
    data=changes.dt)
print(gg.model)
}

## ---- fig.width=10--------------------------------------------------------------------------------
peaks.dt <- segs.dt[status=="peak"]
peaks.dt[, peak.i := rep(1:2, l=.N)]
peak.pos.tall <- melt(
  peaks.dt,
  measure.vars=c("chromStart", "chromEnd"))
peak.pos.wide <- dcast(
  peak.pos.tall,
  experiment + variable + peak.i ~ bases.counted)
peak.pos.wide[, diff.bases := abs(each-end)]
read.size.panel <- "each"
bases.max.dt <- seq.dt[, .(max.count=max(count)), by=list(bases.counted)]
read.size.y <- bases.max.dt[
  read.size.panel, max.count, on=list(bases.counted)]
diff.panel <- "end"
diff.y <- bases.max.dt[
  diff.panel, max.count, on=list(bases.counted)]
diff.y <- Inf
diff.vjust <- 1.1
if(require(ggplot2)){
gg.model+
  geom_text(aes(
    chromStart/1e3, read.size.y, label=sprintf(
      "Median read size:\n%.0f bases",
      median.bases)),
    hjust=0,
    vjust=1,
    data=data.table(experiments, bases.counted=read.size.panel))+
  geom_text(aes(
    end/1e3, diff.y,
    label=diff.bases,
    color=data.type),
    data=data.table(
      bases.counted=diff.panel,
      data.type="model",
      peak.pos.wide),
    vjust=diff.vjust,
    hjust=0)+
  geom_text(aes(
    chromStart/1e3, diff.y,
    label="Peak position\ndifference in bases:",
    color=data.type,
  ),
  hjust=0,
  vjust=diff.vjust,
  data=data.table(
    data.type="model",
    bases.counted=diff.panel,
    experiments["H3K36me3", on=list(experiment)]))
}

Try the PeakSegDisk package in your browser

Any scripts or data that you put into this service are public.

PeakSegDisk documentation built on Sept. 8, 2023, 5:50 p.m.