In vanheeringen-lab/learnr.proto: A set of LearnR Tutorials.

Last updated 2021/02/06

knitr::opts_chunk$set(echo = TRUE, eval=FALSE)

Load libraries

#library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
library(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene 
library(stringr)
library(plyranges)
#BiocManager::install("biomaRt")
#BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
#BiocManager::install("reader") # to remove extension from bw files names --> only installed on my own mac
#library(reader)

week 2

chr19 peaks in GRanges

the raw data are removed from the tutorial package and can be found on blueprint dataportal and on cn45:

bed files were downloaded from, eg.H3K4me3: http://dcc.blueprint-epigenome.eu/#/experiments/ERX547984 ; file "Enriched regions, BED format'.
select chr19 with awk: cat $file | awk '{OFS = "\t"} {if ($1 == "chr19") print $0}' > $(basename $file .bed).chr19.bed
complete and chr19 limited .bed files are stored on cn45: /ceph/rimlsfnwi/web_share/mbdata/ctoenhake/edu/fg1_r_data/blueprint/chip_bed

# H3k4me1 
monocytes_h3k4me1 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547981.H3K4me1.bwa.GRCh38.broad.20150527.chr19.bed", format = "broadPeak")
# H3K4me3  
monocytes_h3k4me3 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547984.H3K4me3.bwa.GRCh38.20150527.chr19.bed", format = "narrowPeak")
# H3K9me3 
monocytes_h3k9me3 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547982.H3K9me3.bwa.GRCh38.broad.20150527.chr19.bed", format = "broadPeak")
# H3K27ac 
monocytes_h3k27ac <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547980.H3K27ac.bwa.GRCh38.20150527.chr19.bed", format = "narrowPeak")
# H3K27me3   
monocytes_h3k27me3 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547983.H3K27me3.bwa.GRCh38.broad.20150527.chr19.bed", format = "broadPeak")
# H3K36me3
monocytes_h3k36me3 <- rtracklayer::import("/home/ctoenhake/learnr.proto/inst/extdata/week2/blueprint/chip_bed/C000S5H2.ERX547979.H3K36me3.bwa.GRCh38.broad.20150527.chr19.bed", format = "broadPeak")
# create list
monocytes_list  <- GRangesList(monocytes_h3k4me1, monocytes_h3k4me3, monocytes_h3k9me3, monocytes_h3k27ac, monocytes_h3k27me3, monocytes_h3k36me3)
# add names to each element in the list
names(monocytes_list) <- c("h3k4me1", "h3k4me3", "h3k9me3", "h3k27ac", "h3k27me3", "h3k36me3")

save(monocytes_h3k4me1, monocytes_h3k4me3, monocytes_h3k9me3, monocytes_h3k27ac, monocytes_h3k27me3, monocytes_h3k36me3, monocytes_list,  file =  "data/prepared_rds/blueprint_monocyte_chr19_granges.RData")

chr19 BLUEPRINT monocytes transcript quantification

add entrez gene id using org.Hs.eg.db package
restrict to genes on chromosome 19
File "C000S5B1.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results"
was downloaded from: http://dcc.blueprint-epigenome.eu/#/experiments/ERX157053 ; file "Transcription quantification (Genes)".
is stored on cn45: /ceph/rimlsfnwi/web_share/mbdata/ctoenhake/edu/fg1_r_data/blueprint/quant/C000S5B1.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results.

#BiocManager::install("org.Hs.eg.db")
library(org.Hs.eg.db)

all_results <- read.table("data/blueprint/rna_quantification/C000S5B1.gene_quantification.rsem_grape2_crg.GRCh38.20150622.results", header =T)

# remove version from ensemble gene id 
all_results$gene_id <- sapply(all_results$gene_id,  sub, pattern = "\\.\\d+$", replacement = "")

# map entrez id to ensemble id
all_results$entrezgene_id = mapIds(org.Hs.eg.db,
                    keys=all_results$gene_id, 
                    column="ENTREZID",
                    keytype="ENSEMBL",
                    multiVals="first")

# vector of entrez ids for genes on chr19 
genes_chr19 <- genes(txdb)
geneids_genes_chr19 <- mcols(genes_chr19)[,1]

# subset all_results for genes on chr19
quantification_chr19 <- all_results[all_results$entrezgene_id %in% geneids_genes_chr19,c("entrezgene_id", "TPM", "FPKM")]

# write quantification_chr19 to rds
saveRDS(quantification_chr19, file = "data/prepared_rds/blueprint_c000s5_gene_quantification_chr19.rds")

week 3

Situation: need bed and bigWig files for practical.
Problem: these files are too large, will take long time before practical is loaded.
Target: files to be used should not exceed 5MB (preferably for the whole package but that is too little).
Failed solution1: select chromosome 19 only --> but this leaves us with bigwig files of ±20-30 MB each! Still too large.
Failed solution2: work with chr21 --> files are small enough but gene expression is not representative for genome wide
Failed solution3: Random select 40% of genes from chr19 based on quantification quartiles and used these to select regions in bigwig file. I thought this would be ok, but running fg2 resulted in a very slow server because of the loaded objects.

Thus, 'situation after the fg2 experience': need to reduce the size of all objects!

Use 50-100 genes add plots for complete chromosome 19 in dropdown box or in feedback to question.
Also subset bed files to be loaded in GRanges objects.

How I obtained the bigwig files used in this script (covering chr19 only):

bigwig files were downloaded from, eg.H3K4me3: http://dcc.blueprint-epigenome.eu/#/experiments/ERX547984 ; file "Normalized signal, BigWig format'.
select chr19 with script: /scratch/ctoenhake/data/fg1_r_data/blueprint/chip_bw/select_chr19.sh
used a bit of a work-around to get these bigwigs on passer:
- copy resulting *.chr19.bw to the web accessible directory that I also used for the trackhubs /ceph/rimlsfnwi/web_share/mbdata/ctoenhake/edu/fg1_r_data/blueprint/chip_bw
- Download file from there to my mac with curl -O [filename] .
- Upload these files to the learn.proto /home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19

After I ran the chunks to get smaller bigwig files, I removed the dir /home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19, and kept /home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19_tss_sample

chr19 - get smaller bigwig by downsampling genes and save associated tss and FPKM files

# Important parameter: the chosen proportion of genes --> replaced by ngenes to reduce the filesize further.
# chosen_proportion <- 0.4


# Important parameter: number of genes that will be used by students for analyses
ngenes <- 60

### set sqlevels in txdb to chr19
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene 
seqlevels(txdb) <- seqlevels0(txdb)
seqlevels(txdb) <- "chr19"

# load chr19 quantification (used in fg2)
rdsfile <- system.file("extdata", "week2", "prepared_rds", "blueprint_c000s5_gene_quantification_chr19.rds", package = "learnr.proto")
quantification_chr19 <- readRDS(rdsfile)
rm(rdsfile)

# define new variable based on quartiles of gene expression
quantification_chr19 <- quantification_chr19 %>% 
  mutate(expression_group = ntile(FPKM, 4))

### set seed for slice_sample
set.seed(10)

# subsample per quartile  
# quantification_chr19_sample <- quantification_chr19 %>% 
#   group_by(expression_group) %>% 
#   slice_sample(prop=chosen_proportion, replace = F)
quantification_chr19_60genes <- quantification_chr19 %>% 
  group_by(expression_group) %>% 
  slice_sample(n=ngenes/4, replace = F)

# check number of genes per quantile and in expression plot
table(quantification_chr19_60genes$expression_group)
ggplot(quantification_chr19, aes(x = as.factor(expression_group), y = log10(FPKM+1)))+
  geom_boxplot()+
  geom_jitter()
ggplot(quantification_chr19_60genes, aes(x = as.factor(expression_group), y = log10(FPKM+1)))+
  geom_boxplot()+
  geom_jitter()

# sample bigwig for regions around the TSS of the sampled genes on chr19  
## make ref tss files:
tss_chr19 <- unique(promoters(genes(txdb), upstream = 2000, downstream = 0 )) 
tss_chr19 <- resize(tss_chr19, width = 1, fix = "end")
tss_chr19$name <- tss_chr19$gene_id
tss_chr19_60genes <- tss_chr19 %>% filter(gene_id %in% quantification_chr19_60genes$entrezgene_id )  
all(quantification_chr19_sample$entrezgene_id %in% tss_chr19_60genes$gene_id)

### input dir:
dir_raw_files <- "/home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19/"
### output dir:
dir_out_files <- "/home/ctoenhake/learnr.proto/inst/extdata/week3/bw_chr19_tss_sample/"

### function to select regions in bigwig files and write output in outdir
bw_files <- dir(path = dir_raw_files, pattern = "*.bw")
resize_bw <- function(w_in_kb, f, tss){
  print(deparse(substitute(f)))
  w_in_bp <- w_in_kb*1000+4
  tss_window <- resize(tss, width = w_in_bp, fix = "center")
  sum(width(tss_window))
  bw_tss_window <- import(paste(dir_raw_files, f, sep = "/"), which = tss_window, as = "RleList")
  bw_out <- file.path(dir_out_files, 
                      paste0(tools::file_path_sans_ext(f), "_chr19tss_", ngenes, "Genes_", w_in_kb, "kb.bw"))
  export.bw(object = bw_tss_window, bw_out)
  rm(tss_window, bw_tss_window, bw_out)
}

## perform function on all files in input directory. 
for(file in bw_files){
resize_bw(10, file, tss_chr19_60genes)
}

chr19 - save objects in week3.Rdata

# make quantification_chr19_60genes (generated in chunk above) ready for saving
quantification_chr19_60genes$expression_group <- NULL
quantification_chr19_60genes_save <- quantification_chr19_60genes
quantification_chr19_60genes_save <- quantification_chr19_60genes_save %>% dplyr::rename(gene_id = entrezgene_id)
quantification_chr19_60genes <- quantification_chr19_60genes_save
rm(quantification_chr19_60genes_save)

# make tss_chr19_60genes  (generated in chunk above)  ready for saving, thus add FPKM to TSS windows such that these intervals can be arranged by FPKM
FPKM_tssorder<- as.vector(NA)
for(i in 1:length(tss_chr19_60genes)) {
  FPKM_tssorder[i] <- quantification_chr19_60genes[which(quantification_chr19_60genes[, "gene_id"] == tss_chr19_60genes$gene_id[i]),]$FPKM
  }
tss_chr19_60genes$FPKM <- FPKM_tssorder
rm(FPKM_tssorder)

# load week2 data and sample H3K4me3 peaks in granges objects to 100 peaks of the complete dataset
rdata = system.file("extdata", "week2.Rdata", package = "learnr.proto")
load(rdata)

monocytes_h3k4me3_metadata <- as.data.frame(mcols(monocytes_h3k4me3)) %>% slice_sample(n = 100, replace = F)
nrow(monocytes_h3k4me3_metadata)
monocytes_h3k4me3_sample <- monocytes_h3k4me3 %>% plyranges::filter(name %in% monocytes_h3k4me3_metadata$name)

# also subset tss_chr19 for the TSSs in this sample
dist_h3k4me3_to_tss <- distance2NearestFeature(monocytes_h3k4me3_sample, tss_chr19)
tss_chr19_ex1 <- tss_chr19 %>% plyranges::filter(gene_id %in% dist_h3k4me3_to_tss$feature.name)

## save granges objects of peaks and sampled/sliced quantification and TSSs
save(monocytes_h3k4me3_sample, tss_chr19_ex1, tss_chr19_60genes, file = "/home/ctoenhake/learnr.proto/inst/extdata/week3.Rdata")