In this analysis we are going to calculate correlations of the PARP1 reads with the methylated reads from UCSC/ENCODE. The PARP1 reads and methylation read data has already been processed and is made available as part of the fmdatabreastcaparp1 package.
knitr::opts_chunk$set(dev='png')
library(GenomicRanges) library(ggplot2) library(fmcorrelationbreastcaparp1) library(fmdatabreastcaparp1) library(BSgenome.Hsapiens.UCSC.hg19)
data(parp1_ln4_unique) data(parp1_ln5_unique) data(methyl_rep1) data(methyl_rep2) data(tss_windows)
Calculate the weighted coverage of the ln4 and ln5 sample reads, and then sum the reads in each TSS window.
mcf7_cov <- coverage(parp1_ln4_unique, weight = "n_count") mdamb231_cov <- coverage(parp1_ln5_unique, weight = "n_count") tss_windows <- binned_function(tss_windows, mcf7_cov, "sum", "parp1_mcf7") tss_windows <- binned_function(tss_windows, mdamb231_cov, "sum", "parp1_mdamb231")
With the methylation data we want to generate a true value for the amount of methylation covering a read, by multiplying the readCount
by the percentMeth
, we should generate the methylation reads only, which is what we are interested in.
mcols(methyl_rep1)$methyl_read <- mcols(methyl_rep1)$mcols.readCount * mcols(methyl_rep1)$mcols.percentMeth / 100 mcols(methyl_rep2)$methyl_read <- mcols(methyl_rep2)$mcols.readCount * mcols(methyl_rep2)$mcols.percentMeth / 100 methyl_r1_cov <- coverage(methyl_rep1, weight = "methyl_read") methyl_r2_cov <- coverage(methyl_rep2, weight = "methyl_read") tss_windows <- binned_function(tss_windows, methyl_r1_cov, "sum", "methyl_r1") tss_windows <- binned_function(tss_windows, methyl_r2_cov, "sum", "methyl_r2")
Now with the Parp1 reads and methylation reads, we can start doing some correlations.
non_zero <- "both"
Here we do a subsampling visualization and correlation to see how things look:
r1_v_mcf7 <- subsample_nonzeros(mcols(tss_windows), c("methyl_r1", "parp1_mcf7"), non_zero = non_zero, n_points = 10000) ggplot(r1_v_mcf7, aes(x = methyl_r1, y = parp1_mcf7)) + geom_point() + scale_y_log10() + scale_x_log10() cor(log10(r1_v_mcf7[,1]+1), log10(r1_v_mcf7[,2]+1)) r2_v_mcf7 <- subsample_nonzeros(mcols(tss_windows), c("methyl_r2", "parp1_mcf7"), non_zero = non_zero, n_points = 10000) ggplot(r2_v_mcf7, aes(x = methyl_r2, y = parp1_mcf7)) + geom_point() + scale_y_log10() + scale_x_log10() cor(log10(r2_v_mcf7[,1]+1), log10(r2_v_mcf7[,2]+1))
And now use all of the non-zero points in both:
all_comb <- expand.grid(c("methyl_r1", "methyl_r2"), c("parp1_mcf7", "parp1_mdamb231"), stringsAsFactors = FALSE) out_cor <- lapply(seq(1, nrow(all_comb)), function(i_row){ #print(i_row) correlate_non_zero(mcols(tss_windows), as.character(all_comb[i_row,]), log_transform = TRUE, non_zero = non_zero, test = TRUE) }) all_comb_names <- paste(all_comb[,1], all_comb[,2], sep = "_v_") out_cor <- do.call(rbind, out_cor) rownames(out_cor) <- all_comb_names
TSS correlations:
knitr::kable(out_cor)
And generate all the graphs.
out_graphs <- lapply(seq(1, nrow(all_comb)), function(i_row){ use_vars <- as.character(all_comb[i_row,]) subpoints <- subsample_nonzeros(mcols(tss_windows), use_vars, non_zero = non_zero, n_points = 10000) ggplot(subpoints, aes_string(x = use_vars[1], y = use_vars[2])) + geom_point() + scale_y_log10() + scale_x_log10() }) out_graphs
Are these correlations a result of association with the TSS's? One way to test this is to set up a calculation genome-wide.
genome_tiles <- tileGenome(seqinfo(Hsapiens), tilewidth = 2000, cut.last.tile.in.chrom = TRUE) genome_tiles <- binned_function(genome_tiles, mcf7_cov, "sum", "parp1_mcf7") genome_tiles <- binned_function(genome_tiles, mdamb231_cov, "sum", "parp1_mdamb231") genome_tiles <- binned_function(genome_tiles, methyl_r1_cov, "sum", "methyl_r1") genome_tiles <- binned_function(genome_tiles, methyl_r2_cov, "sum", "methyl_r2")
genome_r1_v_mcf7 <- subsample_nonzeros(mcols(genome_tiles), c("methyl_r1", "parp1_mcf7"), non_zero = non_zero, n_points = 10000) ggplot(genome_r1_v_mcf7, aes(x = methyl_r1, y = parp1_mcf7)) + scale_x_log10() + scale_y_log10() + geom_point() cor(log(genome_r1_v_mcf7[,1]+1), log(genome_r1_v_mcf7[,2]+1))
genome_cor <- lapply(seq(1, nrow(all_comb)), function(i_row){ #print(i_row) correlate_non_zero(mcols(genome_tiles), as.character(all_comb[i_row,]), log_transform = TRUE, non_zero = non_zero, test = TRUE) }) all_comb_names <- paste(all_comb[,1], all_comb[,2], sep = "_v_") genome_cor <- do.call(rbind, genome_cor) rownames(genome_cor) <- all_comb_names
Genome wide correlations:
knitr::kable(genome_cor)
We will save the correlation results in some plain text files.
saveloc <- "../inst/correlation_tables" write.table(out_cor, file = file.path(saveloc, "methylation_tss.txt"), sep = "\t") write.table(genome_cor, file = file.path(saveloc, "methylation_genome.txt"), sep = "\t")
Sys.time() sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.