testing/normalise_atac_footprints.R

#' Read BED formatted file to GRanges object
#' 
#' @param bed_file Path Path to BED formatted file.
#' @return A GRanges object
#' @examples
#' \dontrun{
#' import_bed("myBedFile.bed")
#' }
#' @note Only imports the first 3 columns of a bed file.
#' @export
read_bed <- function(bed_file){
        dat <- data.table::fread(bed_file, sep = "\t", header = FALSE)
        gr <- GenomicRanges::GRanges(seqnames = dat$V1,
                                     ranges = IRanges(start = dat$V2,
                                                      end = dat$V3))
        return(gr)
}

#' Get the positions of the PWM in ATAC-seq peaks.
#' 
#' @param gr A GRanges object. Usually ranges of ATAC-seq peaks. 
#' @param pwm A positional weight matrix of class Matirx.
#' @param genome A BSgenome object. 
#' @param min.score Minimum alignment score for pwm. Default = "85%".
#' @return A GRanges object with positions of PWM match in peaks.
#' @export
#' @importFrom magrittr %>%
#' @import GenomicRanges
#' @import IRanges
#' @examples
#' \dontrun{
#' library(BSgenome.Mmusculus.UCSC.mm10)
#' ctcf <- read.table("/data/ctcf.pwm") %>% as.matrix()
#' peaks <- read_bed("/data/atac_peaks.bed")
#' mpos <- motif_gr(peaks, ctcf, Mmusculus)
#' }
motif_gr <- function(gr, pwm, genome=Mmusculus, min.score="85%"){
        
        if(class(gr) != "GRanges"){
                stop("Object gr is not of class GRanges.")
        }
        if(class(pwm) != "matrix"){
                stop("Object pwm is not of class matrix.")
        }
        if(class(genome) != "BSgenome"){
                stop("Object genome is not of class BSgenome.")
        }
        if(class(min.score) != "character"){
                stop("Object genome is not of class character.")
        }
        
        # Get the nucleotide sequences
        sequences <- BSgenome::getSeq(x = genome, names=gr)
        
        # Function to find motif in one sequence
        find_motif_start <- function(x)
        {
                motif_starts_pos <- Biostrings::matchPWM(pwm = pwm, subject = sequences[[x]],
                                                     min.score = min.score) #%>% start()
                
        
                motif_starts_neg <- Biostrings::matchPWM(pwm = reverseComplement(pwm),
                                                         subject = sequences[[x]],
                                                         min.score = min.score) %>% start()
                
                starts_pos <- start(gr[x]) + motif_starts_pos
                starts_neg <- start(gr[x]) + motif_starts_neg
                starts <- c(starts_pos, starts_neg)
                
                if (length(starts) == 0){
                        out <- NULL
                } else {
                        ends <- starts + ncol(pwm)
                        out <- GenomicRanges::GRanges(seqnames = seqnames(gr[x]),
                                                      ranges = IRanges(start = starts,
                                                                       end = ends))
                }
                
                return(out)
        }
        
        # Apply the function across all sequences
        motif_ranges <- lapply(X = 1:length(sequences), FUN = find_motif_start)
        
        # Remove the NULLs from where no motif was detected
        is.NullOb <- function(x) is.null(motif_ranges[x]) | all(sapply(motif_ranges[x], is.null))
        no_keep <- lapply(1:length(motif_ranges), is.NullOb) %>% unlist() 
        
        # Convert to Granges object
        motif_ranges <- motif_ranges[!no_keep] %>% unlist() %>% GRangesList() %>% unlist()
        
        # Decrease the width to motif centre
        motif_ranges <- resize(motif_ranges, width = 2, fix = 'center')
        
        # Return GRanges object
        return(motif_ranges)
}


#' Get the signal from bigwig file for defined regions.
#' 
#' @param bigwig Path to a bigwig file. Also accpets http paths. 
#' @param gr A genomic ranges object of regions to obtain signal. All ranges should be the same width.
#' @param range The distance from the centre of gr to flank. Default=100. 
#' @param log Logical. Is the signal in the bigwig file in log space? Default=FALSE.
#' If log = TRUE, the scores in the bigwig file are transformed by e^scores.
#' @return A numeric vector of aggregate signal if aggregate = TRUE, which is the default. 
#' If aggregate = FALSE a data.frame of signal of all regions is returned.
#' @export
#' @import magrittr
#' @import GenomicRanges
#' @import IRanges
#' @import tidyr
#' @import rtracklayer
range_summary <- function(bigwig, gr, range=100, log=FALSE, aggregate=TRUE){
        
        # Make all GRanges same width by fixing centre
        region_gr <- GenomicRanges::resize(gr, width = range+(range+1), 
                                           fix = 'center')

        # Get the signal for each base for the defined regions
        # Reurns a SimpleNumericList
        scores <- rtracklayer::import(con = bigwig, format = "Bigwig",
                                      which=region_gr, as="NumericList")
        
        # Transform scores from Log if log == TRUE
        if(log == TRUE){
                scores <- exp(scores)
        }
        
        # Reformat to data.frame
        scores <- as.data.frame(scores)
        
        # Set the relative postion
        range_intervals <- (0-range):(0+range)
        scores$position <- rep(range_intervals,
                               times=length(unique(scores$group)))
        
        # Drop the group_name
        scores$group_name <- NULL
        
        # Reformat data into matrix
        scores <- tidyr::spread(scores, key = position, value = value)
        rownames(scores) <- scores$group
        scores$group <- NULL
        
        # Aggregate scores if aggregate=TRUE
        if(aggregate == TRUE){
                scores <- colSums(scores)
                rel_pos <- names(scores) %>% as.character %>% as.numeric()
                scores <- data.frame(position=rel_pos,
                                     signal=scores)
                rownames(scores) <- NULL
        }
        
        return(scores)
}


### Range summary for DNA methylation
range_summary_meth <- function(bigwig, gr, range=100, log=FALSE, aggregate=TRUE){
        
        # Make all GRanges same width by fixing centre
        region_gr <- GenomicRanges::resize(gr, width = range+(range+1), 
                                           fix = 'center')
        
        # Get the signal for each base for the defined regions
        # Reurns a SimpleNumericList
        
        scores <- rtracklayer::import(con = bigwig, format = "Bigwig",
                                      which=region_gr, as="NumericList")
        
        # Transform scores from Log if log == TRUE
        if(log == TRUE){
                scores <- exp(scores)
        }
        
        # Reformat to data.frame
        scores <- as.data.frame(scores)
        
        # Set the relative postion
        range_intervals <- (0-range):(0+range)
        scores$position <- rep(range_intervals,
                               times=length(unique(scores$group)))
        
        # Drop the group_name
        scores$group_name <- NULL
        
        # Reformat data into matrix
        scores <- tidyr::spread(scores, key = position, value = value)
        rownames(scores) <- scores$group
        scores$group <- NULL
        
        
        # Find the CG positions
        library(Biostrings)
        seq <- getSeq(Mmusculus, region_gr)
        
        mf <- vmatchPattern(pattern = "CG", subject = seq)
        mf <- start(mf)
        
        mr <- vmatchPattern(pattern = "GC", subject = complement(seq))
        mr <- end(mr)
        
        # Create a matrix indicating CG postions
        n_postions <- 1:ncol(scores)
        
        is_pos_cg <- function(x){
                c_pos <- c(mf[[x]], mr[[x]])
                
                # Get TRUE/FALSE of CG position
                hit <- n_postions %in% c_pos
                
                return(hit)
        } 
        
        cg_hit <- lapply(1:length(region_gr), is_pos_cg)
        cg_hit <- do.call(rbind, cg_hit)
        
        cg_hit[cg_hit ==  FALSE] <- NA
        
        scores <- cg_hit + scores
        scores <- scores - 1
        
        # Aggregate scores if aggregate=TRUE
        if(aggregate == TRUE){
                scores <- colSums(scores)
                rel_pos <- names(scores) %>% as.character %>% as.numeric()
                scores <- data.frame(position=rel_pos,
                                     signal=scores)
                rownames(scores) <- NULL
        }
        
        return(scores)
}


library(magrittr)
library(GenomicRanges)
library(BSgenome.Mmusculus.UCSC.mm10)
library(rtracklayer)
library(reshape2)
library(ggplot2)
library(dplyr)

# ATAC-seq
atac_sig  <- "~/Desktop/browser_files/atac_iPSC_combined_replicates.ins.bigwig"
atac_bias <- "~/Desktop/browser_files/mm10_bias.Scores.bigwig"

# NucleoATAC
nuc_sig <- "~/Desktop/browser_files/atac_iPSC_combined_replicates.nucleoatac_signal.bigwig"
occ <- "~/Desktop/browser_files/atac_iPSC_combined_replicates.occ.bigwig"

# ChIP-seq
oct <- "~/Desktop/browser_files/ips_oct_subtract.bw"
sox <- "~/Desktop/browser_files/ips_sox_subtract.bw"

# Methylation
mc <- "~/Desktop/browser_files/mmiPS_6__p1GFPpos.CG.level.unstranded.bigwig"

# Bed files
regions <- "~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_1.bed"
ctcf <- read.table("~/R_packages/RunATAC/inst/exdata/ctcf.pwm") %>% as.matrix()

regions <- read_bed(regions) 
#regions <- regions[seqnames(regions) == "chr19"]
regions_ctcf <-  motif_gr(regions, pwm = ctcf)

ins <- range_summary(bigwig = atac_sig, gr = regions_ctcf, range = 150, log = FALSE, aggregate = FALSE)
bias <- range_summary(bigwig = atac_bias, gr = regions_ctcf, range = 150, log = TRUE, aggregate = FALSE)
ins_over_bias <- ins / bias

nuc <- range_summary(bigwig = nuc_sig, gr = regions_ctcf, range = 500, aggregate = FALSE)

## Get the nuc occupancy class
occ <- range_summary(bigwig = occ,
                     gr = regions_ctcf, range = 10, log = FALSE, aggregate = FALSE)

occ_means <- rowMeans(occ)

set_occ_class <- function(x){
        
        dat <- NULL
        
        if (x < 0.1) {
                dat <- "A"
        } else if (x >= 0.1 & x < 0.3) {
                dat <- "B" 
        } else if (x >= 0.3 & x < 0.5) {
                dat <- "C"
        } else if (x > 0.5) { 
                dat <- "D"
        }
        
        return(dat)
}

occ_class <- lapply(occ_means, set_occ_class) %>% unlist()




#### Functions fot facet plots
facet_plot_occ_class <- function(dat, occ_class, y_lab=""){
        
        # set the occ class
        dat$class <- occ_class
        
        dat_melt <- melt(dat, id.vars = c('class'))
        
        dat_mean <- dat_melt %>%
                group_by(variable, class) %>%
                summarise(yy=mean(value))
        
        dat_mean$variable <- as.character(dat_mean$variable) %>% as.numeric()
        dash <- 0
        ggplot(dat_mean, aes(x = variable, y = yy, group=class)) +
                geom_line() +
                facet_grid(class~.) +
                ylab(y_lab) +
                xlab("Position relative to motif") +
                theme_bw() +
                theme(plot.background = element_blank(),
                      panel.grid.minor = element_blank(),
                      panel.border = element_blank(),
                      axis.text = element_text(color = 'black'),
                      axis.line = element_line(),
                      #text = element_text(size=8),
                      axis.line.x = element_line(color = 'black'),
                      axis.line.y = element_line(color = 'black'))
}
facet_plot_occ_class_mC <- function(mc_sig, occ_class, y_lab=""){
        
        # set the occ class
        dat$class <- occ_class
        
        dat_melt <- melt(dat, id.vars = c('class'))
        
        # Remove NA
        dat_melt <- dat_melt[complete.cases(dat_melt), ]
        
        #         dat_mean <- dat_melt %>%
        #                 group_by(variable, class) %>%
        #                 summarise(yy=mean(value, na.rm = TRUE))
        
        dat_melt$variable <- as.character(dat_melt$variable) %>% as.numeric()
        
        gg <- ggplot(dat_melt, aes(x = variable, y = value, group=class)) +
                #geom_line() +
                geom_smooth(colour='black', size=1) +
                facet_grid(class~.) +
                ylab(y_lab) +
                xlab("Position relative to motif") +
                theme_bw() +
                theme(plot.background = element_blank(),
                      panel.grid.minor = element_blank(),
                      panel.border = element_blank(),
                      axis.text = element_text(color = 'black'),
                      axis.line = element_line(),
                      #text = element_text(size=8),
                      axis.line.x = element_line(color = 'black'),
                      axis.line.y = element_line(color = 'black'))
        
        return(gg)
}

facet_plot_occ_class(dat = nuc, occ_class=1, y_lab = "Nucleosome signal")
facet_plot_occ_class(dat = ins_over_bias, occ_class=1, y_lab = "Insertions")




load(file = "~/polo_iPSC/resources/pwm_matrix_list.Rda")
# Get mofif PWM for Oct4-Sox2
os_pwm <- pwm_matrix_list$MA0142.1

regions <- ("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_1.bed")
regions <- read_bed(regions)
reg_os <-  motif_gr(regions, pwm = os_pwm, min.score = "70%")

ins <- range_summary(bigwig = atac_sig, gr = reg_os, range = 150, log = FALSE, aggregate = FALSE)
bias <- range_summary(bigwig = atac_bias, gr = reg_os, range = 150, log = TRUE, aggregate = FALSE)
nuc <- range_summary(bigwig = nuc_sig, gr = reg_os, range = 500, aggregate = FALSE)
o_sig <- range_summary(bigwig = oct, gr = reg_os, range = 500, log = FALSE, aggregate = FALSE)
s_sig <- range_summary(bigwig = sox, gr = reg_os, range = 500, log = FALSE, aggregate = FALSE)

mc_sig <- range_summary_meth(bigwig = mc, gr = reg_os, range = 500, log = FALSE, aggregate = FALSE)






# Normalise insertions by bias signal
ins_over_bias <- ins  / bias

## Get the nuc occupancy class
occ <- range_summary(bigwig = occ,
                     gr = reg_os, range = 10, log = FALSE, aggregate = FALSE)

occ_means <- rowMeans(occ)

set_occ_class <- function(x){
        
        dat <- NULL
        
        if (x < 0.1) {
                dat <- "A"
        } else if (x >= 0.1 & x < 0.3) {
                dat <- "B" 
        } else if (x >= 0.3 & x < 0.5) {
                dat <- "C"
        } else if (x > 0.5) { 
                dat <- "D"
        }
        
        return(dat)
}

occ_class <- lapply(occ_means, set_occ_class) %>% unlist()








gg_nuc <- facet_plot_occ_class(nuc, occ_class = occ_class, y_lab = "Nucleosome signal")
gg_nuc
gg_oct <- facet_plot_occ_class(o_sig, occ_class = occ_class, y_lab = "Oct4 ChIP-seq (CPM)")
gg_sox <- facet_plot_occ_class(s_sig, occ_class = occ_class, y_lab = "Sox2 ChIP-seq (CPM)")
gg_ins <- facet_plot_occ_class(ins_over_bias, occ_class = occ_class, y_lab = "ATAC-seq insertions / insertion bias")
gg_mc <- facet_plot_occ_class_mC(mc_sig, occ_class = occ_class, y_lab = "DNA methylation")
gg_mc
library(cowplot)

pdf("~/Desktop/nucleosome_plots.pdf", width = 12.5, height = 5)
plot_grid(gg_nuc, 
          gg_ins, 
          gg_oct,
          gg_sox,
          gg_mc,
          nrow=1, ncol=5)
dev.off()


library(pheatmap)

pdf("~/Desktop/testHeatmap.pdf")
pheatmap(mat = o_sig, cluster_rows = TRUE, cluster_cols = FALSE, scale = 'row')
dev.off()

test <- as.matrix(ins)
image(test)






##### Does nucleosome occupancy score change over time of OS in open chromatin?

# Get mofif PWM for Oct4-Sox2
load(file = "~/polo_iPSC/resources/pwm_matrix_list.Rda")
os_pwm <- pwm_matrix_list$MA0142.1

# GRanges of OS motif in open chromatin (Cluster 1)
c1_gr <- read_bed("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_6.bed")
os_gr <-  motif_gr(c1_gr, pwm = os_pwm, min.score = "85%")

# Bigwig of nucleoATAC occupancy scores
occ_fls <- c("/Volumes/Datasets/atac_MEF_combined_replicates.occ.bigwig",
        "/Volumes/Datasets/atac_d3_combined_replicates.occ.bigwig",
        "/Volumes/Datasets/atac_d6_combined_replicates.occ.bigwig",
        "/Volumes/Datasets/atac_d9_combined_replicates.occ.bigwig",
        "/Volumes/Datasets/atac_d12_combined_replicates.occ.bigwig",
        "/Volumes/Datasets/atac_iPSC_combined_replicates.occ.bigwig")

## Get the nuc occupancy score for each 

occ_scores <- lapply(occ_fls, range_summary, 
                     gr = os_gr, range = 10, log = FALSE, aggregate = FALSE)

occ_means <- lapply(occ_scores, rowMeans)

occ_means <- do.call(cbind, occ_means)
colnames(occ_means) <- c("MEF", "d3", "d6", "d9", "d12", "iPSC")

occ_means <- melt(occ_means)

ggplot(occ_means, aes(x = Var2, y=value)) + geom_violin()



### Predict ChIP-seq peaks from ATAC-seq data

c1_gr <- read_bed("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_1.bed")
c5_gr <- read_bed("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_5.bed")
c7_gr <- read_bed("~/polo_iPSC/ATACseq/processed_data/atac_cluster_peaks/c_means_peaks/cluster_7.bed")

ips_open <- c(c1_gr, c5_gr, c7_gr)
ips_open <-  motif_gr(ips_open, pwm = os_pwm, min.score = "75%")
SamBuckberry/RunATAC documentation built on Aug. 2, 2021, 9:54 a.m.