knitr::opts_chunk$set(echo = TRUE, eval=FALSE)
#library(tidyverse) library(GenomicRanges) library(rtracklayer) library(org.Hs.eg.db) library(TxDb.Hsapiens.UCSC.hg38.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene library(stringr) library(plyranges) #BiocManager::install("biomaRt") #BiocManager::install("ComplexHeatmap") library(ComplexHeatmap) #BiocManager::install("reader") # to remove extension from bw files names --> only installed on my own mac #library(reader)
the raw data are removed from the tutorial package and can be found on blueprint dataportal and on cn45:
cat $file | awk '{OFS = "\t"} {if ($1 == "chr19") print $0}' > $(basename $file .bed).chr19.bed
/ceph/rimlsfnwi/web_share/mbdata/ctoenhake/edu/fg1_r_data/blueprint/chip_bed
# H3k4me1 monocytes_h3k4me1 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547981.H3K4me1.bwa.GRCh38.broad.20150527.chr19.bed", format = "broadPeak") # H3K4me3 monocytes_h3k4me3 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547984.H3K4me3.bwa.GRCh38.20150527.chr19.bed", format = "narrowPeak") # H3K9me3 monocytes_h3k9me3 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547982.H3K9me3.bwa.GRCh38.broad.20150527.chr19.bed", format = "broadPeak") # H3K27ac monocytes_h3k27ac <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547980.H3K27ac.bwa.GRCh38.20150527.chr19.bed", format = "narrowPeak") # H3K27me3 monocytes_h3k27me3 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547983.H3K27me3.bwa.GRCh38.broad.20150527.chr19.bed", format = "broadPeak") # H3K36me3 monocytes_h3k36me3 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547979.H3K36me3.bwa.GRCh38.broad.20150527.chr19.bed", format = "broadPeak") # create list monocytes_list <- GRangesList(monocytes_h3k4me1, monocytes_h3k4me3, monocytes_h3k9me3, monocytes_h3k27ac, monocytes_h3k27me3, monocytes_h3k36me3) # add names to each element in the list names(monocytes_list) <- c("h3k4me1", "h3k4me3", "h3k9me3", "h3k27ac", "h3k27me3", "h3k36me3")
save(monocytes_h3k4me1, monocytes_h3k4me3, monocytes_h3k9me3, monocytes_h3k27ac, monocytes_h3k27me3, monocytes_h3k36me3, monocytes_list, file = "data/prepared_rds/blueprint_monocyte_chr19_granges.RData")
restrict to genes on chromosome 19
File "C000S5B1.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results"
/ceph/rimlsfnwi/web_share/mbdata/ctoenhake/edu/fg1_r_data/blueprint/quant/C000S5B1.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results
. #BiocManager::install("org.Hs.eg.db") library(org.Hs.eg.db) all_results <- read.table("data/blueprint/rna_quantification/C000S5B1.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results", header =T) # remove version from ensemble gene id all_results$gene_id <- sapply(all_results$gene_id, sub, pattern = "\\.\\d+$", replacement = "") # map entrez id to ensemble id all_results$entrezgene_id = mapIds(org.Hs.eg.db, keys=all_results$gene_id, column="ENTREZID", keytype="ENSEMBL", multiVals="first") # vector of entrez ids for genes on chr19 genes_chr19 <- genes(txdb) geneids_genes_chr19 <- mcols(genes_chr19)[,1] # subset all_results for genes on chr19 quantification_chr19 <- all_results[all_results$entrezgene_id %in% geneids_genes_chr19,c("entrezgene_id", "TPM", "FPKM")]
# write quantification_chr19 to rds saveRDS(quantification_chr19, file = "data/prepared_rds/blueprint_c000s5_gene_quantification_chr19.rds")
Thus, 'situation after the fg2 experience': need to reduce the size of all objects!
/scratch/ctoenhake/data/fg1_r_data/blueprint/chip_bw/select_chr19.sh
/ceph/rimlsfnwi/web_share/mbdata/ctoenhake/edu/fg1_r_data/blueprint/chip_bw
/home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19
After I ran the chunks to get smaller bigwig files, I removed the dir /home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19
, and kept /home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19_tss_sample
# Important parameter: the chosen proportion of genes --> replaced by ngenes to reduce the filesize further. # chosen_proportion <- 0.4 # Important parameter: number of genes that will be used by students for analyses ngenes <- 60
### set sqlevels in txdb to chr19 library(TxDb.Hsapiens.UCSC.hg38.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene seqlevels(txdb) <- seqlevels0(txdb) seqlevels(txdb) <- "chr19" # load chr19 quantification (used in fg2) rdsfile <- system.file("extdata", "week2", "prepared_rds", "blueprint_c000s5_gene_quantification_chr19.rds", package = "learnr.proto") quantification_chr19 <- readRDS(rdsfile) rm(rdsfile)
# define new variable based on quartiles of gene expression quantification_chr19 <- quantification_chr19 %>% mutate(expression_group = ntile(FPKM, 4)) ### set seed for slice_sample set.seed(10) # subsample per quartile # quantification_chr19_sample <- quantification_chr19 %>% # group_by(expression_group) %>% # slice_sample(prop=chosen_proportion, replace = F) quantification_chr19_60genes <- quantification_chr19 %>% group_by(expression_group) %>% slice_sample(n=ngenes/4, replace = F) # check number of genes per quantile and in expression plot table(quantification_chr19_60genes$expression_group) ggplot(quantification_chr19, aes(x = as.factor(expression_group), y = log10(FPKM+1)))+ geom_boxplot()+ geom_jitter() ggplot(quantification_chr19_60genes, aes(x = as.factor(expression_group), y = log10(FPKM+1)))+ geom_boxplot()+ geom_jitter()
# sample bigwig for regions around the TSS of the sampled genes on chr19 ## make ref tss files: tss_chr19 <- unique(promoters(genes(txdb), upstream = 2000, downstream = 0 )) tss_chr19 <- resize(tss_chr19, width = 1, fix = "end") tss_chr19$name <- tss_chr19$gene_id tss_chr19_60genes <- tss_chr19 %>% filter(gene_id %in% quantification_chr19_60genes$entrezgene_id ) all(quantification_chr19_sample$entrezgene_id %in% tss_chr19_60genes$gene_id) ### input dir: dir_raw_files <- "/home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19/" ### output dir: dir_out_files <- "/home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19_tss_sample/" ### function to select regions in bigwig files and write output in outdir bw_files <- dir(path = dir_raw_files, pattern = "*.bw") resize_bw <- function(w_in_kb, f, tss){ print(deparse(substitute(f))) w_in_bp <- w_in_kb*1000+4 tss_window <- resize(tss, width = w_in_bp, fix = "center") sum(width(tss_window)) bw_tss_window <- import(paste(dir_raw_files, f, sep = "/"), which = tss_window, as = "RleList") bw_out <- file.path(dir_out_files, paste0(tools::file_path_sans_ext(f), "_chr19tss_", ngenes, "Genes_", w_in_kb, "kb.bw")) export.bw(object = bw_tss_window, bw_out) rm(tss_window, bw_tss_window, bw_out) } ## perform function on all files in input directory. for(file in bw_files){ resize_bw(10, file, tss_chr19_60genes) }
# make quantification_chr19_60genes (generated in chunk above) ready for saving quantification_chr19_60genes$expression_group <- NULL quantification_chr19_60genes_save <- quantification_chr19_60genes quantification_chr19_60genes_save <- quantification_chr19_60genes_save %>% dplyr::rename(gene_id = entrezgene_id) quantification_chr19_60genes <- quantification_chr19_60genes_save rm(quantification_chr19_60genes_save) # make tss_chr19_60genes (generated in chunk above) ready for saving, thus add FPKM to TSS windows such that these intervals can be arranged by FPKM FPKM_tssorder<- as.vector(NA) for(i in 1:length(tss_chr19_60genes)) { FPKM_tssorder[i] <- quantification_chr19_60genes[which(quantification_chr19_60genes[, "gene_id"] == tss_chr19_60genes$gene_id[i]),]$FPKM } tss_chr19_60genes$FPKM <- FPKM_tssorder rm(FPKM_tssorder) # load week2 data and sample H3K4me3 peaks in granges objects to 100 peaks of the complete dataset rdata = system.file("extdata", "week2.Rdata", package = "learnr.proto") load(rdata) monocytes_h3k4me3_metadata <- as.data.frame(mcols(monocytes_h3k4me3)) %>% slice_sample(n = 100, replace = F) nrow(monocytes_h3k4me3_metadata) monocytes_h3k4me3_sample <- monocytes_h3k4me3 %>% plyranges::filter(name %in% monocytes_h3k4me3_metadata$name) # also subset tss_chr19 for the TSSs in this sample dist_h3k4me3_to_tss <- distance2NearestFeature(monocytes_h3k4me3_sample, tss_chr19) tss_chr19_ex1 <- tss_chr19 %>% plyranges::filter(gene_id %in% dist_h3k4me3_to_tss$feature.name) ## save granges objects of peaks and sampled/sliced quantification and TSSs save(monocytes_h3k4me3_sample, tss_chr19_ex1, tss_chr19_60genes, file = "/home/ctoenhake/learnr.proto/inst/extdata/week3.Rdata")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.