#' Read BED formatted file to GRanges object
#'
#' @param bed_file Path Path to BED formatted file.
#' @return A GRanges object
#' @examples
#' \dontrun{
#' import_bed("myBedFile.bed")
#' }
#' @note Only imports the first 3 columns of a bed file.
#' @export
read_bed <- function(bed_file){
dat <- data.table::fread(bed_file, sep = "\t", header = FALSE)
gr <- GenomicRanges::GRanges(seqnames = dat$V1,
ranges = IRanges(start = dat$V2,
end = dat$V3))
return(gr)
}
#' Get the positions of the PWM in ATAC-seq peaks.
#'
#' @param gr A GRanges object. Usually ranges of ATAC-seq peaks.
#' @param pwm A positional weight matrix of class Matirx.
#' @param genome A BSgenome object.
#' @param min.score Minimum alignment score for pwm. Default = "85%".
#' @return A GRanges object with positions of PWM match in peaks.
#' @export
#' @importFrom magrittr %>%
#' @import GenomicRanges
#' @import IRanges
#' @examples
#' \dontrun{
#' library(BSgenome.Mmusculus.UCSC.mm10)
#' ctcf <- read.table("/data/ctcf.pwm") %>% as.matrix()
#' peaks <- read_bed("/data/atac_peaks.bed")
#' mpos <- motif_gr(peaks, ctcf, Mmusculus)
#' }
motif_gr <- function(gr, pwm, genome=Mmusculus, min.score="85%"){
if(class(gr) != "GRanges"){
stop("Object gr is not of class GRanges.")
}
if(class(pwm) != "matrix"){
stop("Object pwm is not of class matrix.")
}
if(class(genome) != "BSgenome"){
stop("Object genome is not of class BSgenome.")
}
if(class(min.score) != "character"){
stop("Object genome is not of class character.")
}
# Get the nucleotide sequences
sequences <- BSgenome::getSeq(x = genome, names=gr)
# Function to find motif in one sequence
find_motif_start <- function(x)
{
motif_starts_pos <- Biostrings::matchPWM(pwm = pwm, subject = sequences[[x]],
min.score = min.score) #%>% start()
motif_starts_neg <- Biostrings::matchPWM(pwm = reverseComplement(pwm),
subject = sequences[[x]],
min.score = min.score) %>% start()
starts_pos <- start(gr[x]) + motif_starts_pos
starts_neg <- start(gr[x]) + motif_starts_neg
starts <- c(starts_pos, starts_neg)
if (length(starts) == 0){
out <- NULL
} else {
ends <- starts + ncol(pwm)
out <- GenomicRanges::GRanges(seqnames = seqnames(gr[x]),
ranges = IRanges(start = starts,
end = ends))
}
return(out)
}
# Apply the function across all sequences
motif_ranges <- lapply(X = 1:length(sequences), FUN = find_motif_start)
# Remove the NULLs from where no motif was detected
is.NullOb <- function(x) is.null(motif_ranges[x]) | all(sapply(motif_ranges[x], is.null))
no_keep <- lapply(1:length(motif_ranges), is.NullOb) %>% unlist()
# Convert to Granges object
motif_ranges <- motif_ranges[!no_keep] %>% unlist() %>% GRangesList() %>% unlist()
# Decrease the width to motif centre
motif_ranges <- resize(motif_ranges, width = 2, fix = 'center')
# Return GRanges object
return(motif_ranges)
}
#' Get the signal from bigwig file for defined regions.
#'
#' @param bigwig Path to a bigwig file. Also accpets http paths.
#' @param gr A genomic ranges object of regions to obtain signal. All ranges should be the same width.
#' @param range The distance from the centre of gr to flank. Default=100.
#' @param log Logical. Is the signal in the bigwig file in log space? Default=FALSE.
#' If log = TRUE, the scores in the bigwig file are transformed by e^scores.
#' @return A numeric vector of aggregate signal if aggregate = TRUE, which is the default.
#' If aggregate = FALSE a data.frame of signal of all regions is returned.
#' @export
#' @import magrittr
#' @import GenomicRanges
#' @import IRanges
#' @import tidyr
#' @import rtracklayer
range_summary <- function(bigwig, gr, range=100, log=FALSE, aggregate=TRUE){
# Make all GRanges same width by fixing centre
region_gr <- GenomicRanges::resize(gr, width = range+(range+1),
fix = 'center')
# Get the signal for each base for the defined regions
# Reurns a SimpleNumericList
scores <- rtracklayer::import(con = bigwig, format = "Bigwig",
which=region_gr, as="NumericList")
# Transform scores from Log if log == TRUE
if(log == TRUE){
scores <- exp(scores)
}
# Reformat to data.frame
scores <- as.data.frame(scores)
# Set the relative postion
range_intervals <- (0-range):(0+range)
scores$position <- rep(range_intervals,
times=length(unique(scores$group)))
# Drop the group_name
scores$group_name <- NULL
# Reformat data into matrix
scores <- tidyr::spread(scores, key = position, value = value)
rownames(scores) <- scores$group
scores$group <- NULL
# Aggregate scores if aggregate=TRUE
if(aggregate == TRUE){
scores <- colSums(scores)
rel_pos <- names(scores) %>% as.character %>% as.numeric()
scores <- data.frame(position=rel_pos,
signal=scores)
rownames(scores) <- NULL
}
return(scores)
}
### Range summary for DNA methylation
range_summary_meth <- function(bigwig, gr, range=100, log=FALSE, aggregate=TRUE){
# Make all GRanges same width by fixing centre
region_gr <- GenomicRanges::resize(gr, width = range+(range+1),
fix = 'center')
# Get the signal for each base for the defined regions
# Reurns a SimpleNumericList
scores <- rtracklayer::import(con = bigwig, format = "Bigwig",
which=region_gr, as="NumericList")
# Transform scores from Log if log == TRUE
if(log == TRUE){
scores <- exp(scores)
}
# Reformat to data.frame
scores <- as.data.frame(scores)
# Set the relative postion
range_intervals <- (0-range):(0+range)
scores$position <- rep(range_intervals,
times=length(unique(scores$group)))
# Drop the group_name
scores$group_name <- NULL
# Reformat data into matrix
scores <- tidyr::spread(scores, key = position, value = value)
rownames(scores) <- scores$group
scores$group <- NULL
# Find the CG positions
library(Biostrings)
seq <- getSeq(Mmusculus, region_gr)
mf <- vmatchPattern(pattern = "CG", subject = seq)
mf <- start(mf)
mr <- vmatchPattern(pattern = "GC", subject = complement(seq))
mr <- end(mr)
# Create a matrix indicating CG postions
n_postions <- 1:ncol(scores)
is_pos_cg <- function(x){
c_pos <- c(mf[[x]], mr[[x]])
# Get TRUE/FALSE of CG position
hit <- n_postions %in% c_pos
return(hit)
}
cg_hit <- lapply(1:length(region_gr), is_pos_cg)
cg_hit <- do.call(rbind, cg_hit)
cg_hit[cg_hit == FALSE] <- NA
scores <- cg_hit + scores
scores <- scores - 1
# Aggregate scores if aggregate=TRUE
if(aggregate == TRUE){
scores <- colSums(scores)
rel_pos <- names(scores) %>% as.character %>% as.numeric()
scores <- data.frame(position=rel_pos,
signal=scores)
rownames(scores) <- NULL
}
return(scores)
}
library(magrittr)
library(GenomicRanges)
library(BSgenome.Mmusculus.UCSC.mm10)
library(rtracklayer)
library(reshape2)
library(ggplot2)
library(dplyr)
# ATAC-seq
atac_sig <- "~/Desktop/browser_files/atac_iPSC_combined_replicates.ins.bigwig"
atac_bias <- "~/Desktop/browser_files/mm10_bias.Scores.bigwig"
# NucleoATAC
nuc_sig <- "~/Desktop/browser_files/atac_iPSC_combined_replicates.nucleoatac_signal.bigwig"
occ <- "~/Desktop/browser_files/atac_iPSC_combined_replicates.occ.bigwig"
# ChIP-seq
oct <- "~/Desktop/browser_files/ips_oct_subtract.bw"
sox <- "~/Desktop/browser_files/ips_sox_subtract.bw"
# Methylation
mc <- "~/Desktop/browser_files/mmiPS_6__p1GFPpos.CG.level.unstranded.bigwig"
# Bed files
regions <- "~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_1.bed"
ctcf <- read.table("~/R_packages/RunATAC/inst/exdata/ctcf.pwm") %>% as.matrix()
regions <- read_bed(regions)
#regions <- regions[seqnames(regions) == "chr19"]
regions_ctcf <- motif_gr(regions, pwm = ctcf)
ins <- range_summary(bigwig = atac_sig, gr = regions_ctcf, range = 150, log = FALSE, aggregate = FALSE)
bias <- range_summary(bigwig = atac_bias, gr = regions_ctcf, range = 150, log = TRUE, aggregate = FALSE)
ins_over_bias <- ins / bias
nuc <- range_summary(bigwig = nuc_sig, gr = regions_ctcf, range = 500, aggregate = FALSE)
## Get the nuc occupancy class
occ <- range_summary(bigwig = occ,
gr = regions_ctcf, range = 10, log = FALSE, aggregate = FALSE)
occ_means <- rowMeans(occ)
set_occ_class <- function(x){
dat <- NULL
if (x < 0.1) {
dat <- "A"
} else if (x >= 0.1 & x < 0.3) {
dat <- "B"
} else if (x >= 0.3 & x < 0.5) {
dat <- "C"
} else if (x > 0.5) {
dat <- "D"
}
return(dat)
}
occ_class <- lapply(occ_means, set_occ_class) %>% unlist()
#### Functions fot facet plots
facet_plot_occ_class <- function(dat, occ_class, y_lab=""){
# set the occ class
dat$class <- occ_class
dat_melt <- melt(dat, id.vars = c('class'))
dat_mean <- dat_melt %>%
group_by(variable, class) %>%
summarise(yy=mean(value))
dat_mean$variable <- as.character(dat_mean$variable) %>% as.numeric()
dash <- 0
ggplot(dat_mean, aes(x = variable, y = yy, group=class)) +
geom_line() +
facet_grid(class~.) +
ylab(y_lab) +
xlab("Position relative to motif") +
theme_bw() +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text = element_text(color = 'black'),
axis.line = element_line(),
#text = element_text(size=8),
axis.line.x = element_line(color = 'black'),
axis.line.y = element_line(color = 'black'))
}
facet_plot_occ_class_mC <- function(mc_sig, occ_class, y_lab=""){
# set the occ class
dat$class <- occ_class
dat_melt <- melt(dat, id.vars = c('class'))
# Remove NA
dat_melt <- dat_melt[complete.cases(dat_melt), ]
# dat_mean <- dat_melt %>%
# group_by(variable, class) %>%
# summarise(yy=mean(value, na.rm = TRUE))
dat_melt$variable <- as.character(dat_melt$variable) %>% as.numeric()
gg <- ggplot(dat_melt, aes(x = variable, y = value, group=class)) +
#geom_line() +
geom_smooth(colour='black', size=1) +
facet_grid(class~.) +
ylab(y_lab) +
xlab("Position relative to motif") +
theme_bw() +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.text = element_text(color = 'black'),
axis.line = element_line(),
#text = element_text(size=8),
axis.line.x = element_line(color = 'black'),
axis.line.y = element_line(color = 'black'))
return(gg)
}
facet_plot_occ_class(dat = nuc, occ_class=1, y_lab = "Nucleosome signal")
facet_plot_occ_class(dat = ins_over_bias, occ_class=1, y_lab = "Insertions")
load(file = "~/polo_iPSC/resources/pwm_matrix_list.Rda")
# Get mofif PWM for Oct4-Sox2
os_pwm <- pwm_matrix_list$MA0142.1
regions <- ("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_1.bed")
regions <- read_bed(regions)
reg_os <- motif_gr(regions, pwm = os_pwm, min.score = "70%")
ins <- range_summary(bigwig = atac_sig, gr = reg_os, range = 150, log = FALSE, aggregate = FALSE)
bias <- range_summary(bigwig = atac_bias, gr = reg_os, range = 150, log = TRUE, aggregate = FALSE)
nuc <- range_summary(bigwig = nuc_sig, gr = reg_os, range = 500, aggregate = FALSE)
o_sig <- range_summary(bigwig = oct, gr = reg_os, range = 500, log = FALSE, aggregate = FALSE)
s_sig <- range_summary(bigwig = sox, gr = reg_os, range = 500, log = FALSE, aggregate = FALSE)
mc_sig <- range_summary_meth(bigwig = mc, gr = reg_os, range = 500, log = FALSE, aggregate = FALSE)
# Normalise insertions by bias signal
ins_over_bias <- ins / bias
## Get the nuc occupancy class
occ <- range_summary(bigwig = occ,
gr = reg_os, range = 10, log = FALSE, aggregate = FALSE)
occ_means <- rowMeans(occ)
set_occ_class <- function(x){
dat <- NULL
if (x < 0.1) {
dat <- "A"
} else if (x >= 0.1 & x < 0.3) {
dat <- "B"
} else if (x >= 0.3 & x < 0.5) {
dat <- "C"
} else if (x > 0.5) {
dat <- "D"
}
return(dat)
}
occ_class <- lapply(occ_means, set_occ_class) %>% unlist()
gg_nuc <- facet_plot_occ_class(nuc, occ_class = occ_class, y_lab = "Nucleosome signal")
gg_nuc
gg_oct <- facet_plot_occ_class(o_sig, occ_class = occ_class, y_lab = "Oct4 ChIP-seq (CPM)")
gg_sox <- facet_plot_occ_class(s_sig, occ_class = occ_class, y_lab = "Sox2 ChIP-seq (CPM)")
gg_ins <- facet_plot_occ_class(ins_over_bias, occ_class = occ_class, y_lab = "ATAC-seq insertions / insertion bias")
gg_mc <- facet_plot_occ_class_mC(mc_sig, occ_class = occ_class, y_lab = "DNA methylation")
gg_mc
library(cowplot)
pdf("~/Desktop/nucleosome_plots.pdf", width = 12.5, height = 5)
plot_grid(gg_nuc,
gg_ins,
gg_oct,
gg_sox,
gg_mc,
nrow=1, ncol=5)
dev.off()
library(pheatmap)
pdf("~/Desktop/testHeatmap.pdf")
pheatmap(mat = o_sig, cluster_rows = TRUE, cluster_cols = FALSE, scale = 'row')
dev.off()
test <- as.matrix(ins)
image(test)
##### Does nucleosome occupancy score change over time of OS in open chromatin?
# Get mofif PWM for Oct4-Sox2
load(file = "~/polo_iPSC/resources/pwm_matrix_list.Rda")
os_pwm <- pwm_matrix_list$MA0142.1
# GRanges of OS motif in open chromatin (Cluster 1)
c1_gr <- read_bed("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_6.bed")
os_gr <- motif_gr(c1_gr, pwm = os_pwm, min.score = "85%")
# Bigwig of nucleoATAC occupancy scores
occ_fls <- c("/Volumes/Datasets/atac_MEF_combined_replicates.occ.bigwig",
"/Volumes/Datasets/atac_d3_combined_replicates.occ.bigwig",
"/Volumes/Datasets/atac_d6_combined_replicates.occ.bigwig",
"/Volumes/Datasets/atac_d9_combined_replicates.occ.bigwig",
"/Volumes/Datasets/atac_d12_combined_replicates.occ.bigwig",
"/Volumes/Datasets/atac_iPSC_combined_replicates.occ.bigwig")
## Get the nuc occupancy score for each
occ_scores <- lapply(occ_fls, range_summary,
gr = os_gr, range = 10, log = FALSE, aggregate = FALSE)
occ_means <- lapply(occ_scores, rowMeans)
occ_means <- do.call(cbind, occ_means)
colnames(occ_means) <- c("MEF", "d3", "d6", "d9", "d12", "iPSC")
occ_means <- melt(occ_means)
ggplot(occ_means, aes(x = Var2, y=value)) + geom_violin()
### Predict ChIP-seq peaks from ATAC-seq data
c1_gr <- read_bed("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_1.bed")
c5_gr <- read_bed("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_5.bed")
c7_gr <- read_bed("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_7.bed")
ips_open <- c(c1_gr, c5_gr, c7_gr)
ips_open <- motif_gr(ips_open, pwm = os_pwm, min.score = "75%")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.