knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, results='hide')

R Markdown

options(mc.cores = 10)
library(seqqc)
library(seqsetvis)
library(BiocFileCache)
bfc = BiocFileCache()
bfcif = ssvRecipes::bfcif
wd = "/slipstream/home/arrichman/prebALL/ATAC/sorted_allreps/ATACseq"
fastq_dir = file.path(wd, "210222_A00405_0352_AHT2VWDSXY/Richman")
fastq_files = dir(fastq_dir, 
                  pattern = "R1_001.fastq.gz$", 
                  full.names = TRUE)
peak_dir = dir(file.path(wd), full.names = "TRUE")
peak_files = dir(peak_dir, 
                 pattern = ".final.peaks$", 
                 full.names = TRUE)
bam_dir = peak_dir
bam_files = dir(bam_dir, pattern = "final.bam$", 
                full.names = TRUE)
bw_dir = peak_dir
bw_files = dir(bw_dir, pattern = "[0-9].bw$", 
               full.names = TRUE)
fix_names = function(in_f){
  f = basename(in_f)
  if(grepl("fastq", f)){
    f = sapply(strsplit(f, "_"), function(x){
      paste0(x[1], "_", x[2], "_rep", x[3])
    })
  }else{
    f = sub("\\..+", "", f)  
  }

  # f = sub("Frietze_Cohrs_11497_180409_SNL128_0174_ACC3JAACXX_", "", f)
  # f = sub("Control_HFL_AntiCTCF_S31_L005.+", "HFL_con_CTCF_set1", f)
  # f = sub("Control_HFL_Input_S30_L005.+", "HFL_con_in_set1", f)
  # f = sub("VZV_AntiCTCF_S33_L005.+", "HFL_VZV_CTCF_set1", f)
  # f = sub("VZV_Input_S32_L005.+", "HFL_VZV_in_set1", f)
  # #look behind (?<=set[0-9])_.+
  # f = sub("(?<=set[0-9])_.+", "", f, perl = TRUE)

  f
}

names(bw_files) = fix_names(bw_files)
names(bam_files) = fix_names(bam_files)
names(peak_files) = fix_names(peak_files)
names(fastq_files) = fix_names(fastq_files)
my_make_config_dt = function(files){
  config_dt = data.table(file = files, name = names(files))
  config_dt[, c("cell", "treatment", "rep") := tstrsplit(name, "_")]
  config_dt = config_dt[order(rep)][order(treatment)][order(cell)]
  config_dt$name = factor(config_dt$name, levels = config_dt$name)
  config_dt
}
fastq_config_dt = my_make_config_dt(fastq_files)
peak_config_dt = my_make_config_dt(peak_files)
bam_config_dt = my_make_config_dt(bam_files)
bw_config_dt = my_make_config_dt(bw_files)

stopifnot(file.exists(fastq_config_dt$file))
stopifnot(file.exists(peak_config_dt$file))
stopifnot(file.exists(bam_config_dt$file))
stopifnot(file.exists(bw_config_dt$file))
fq_dt = bfcif(bfc, rname = paste("fastq_dt", digest::digest(list(fastq_config_dt))), function(){
  make_fq_dt(fastq_config_dt, cache_counts = FALSE)
})

plot_fq_dt(fq_dt)
peak_grs = easyLoad_narrowPeak(peak_config_dt$file, file_names = peak_config_dt$name)
olaps_gr = ssvOverlapIntervalSets(peak_grs)
cons2_gr = ssvConsensusIntervalSets(peak_grs, min_number = 3, min_fraction = 0)
set.seed(0)
asses_gr = resize(sample(cons2_gr, 5e3), 600, fix = "center")
asses_gr = prepare_fetch_GRanges_names(asses_gr)
feature_plots = plot_feature_comparison(peak_grs)
cowplot::plot_grid(plotlist = feature_plots$all)
cowplot::plot_grid(plotlist = feature_plots$consensus)
scc_dt = make_scc_dt(bam_config_dt, query_gr = asses_gr, flag = Rsamtools::scanBamFlag(isFirstMateRead = TRUE))
fl = round(mean(scc_dt$fragment_length[grepl("CTCF", name)]$fragment_length))

scc_plots = plot_scc_dt(scc_dt)

scc_plots$scc_curves + facet_wrap("~name", ncol = 3)
scc_plots$scc_dots + facet_wrap("~name", ncol = 3)
view_size = 3e3
asses_gr.c = bfcif(bfc, rname = paste("center_qgr", digest::digest(list(bw_config_dt, asses_gr, view_size, fl))), function(){
  make_centered_query_gr(bw_config_dt, asses_gr, view_size = view_size)
})
frip_dt = bfcif(bfc, rname = paste("frip_dt", digest::digest(list(bam_config_dt, asses_gr.c))), function(){
  make_frip_dt(bam_config_dt, asses_gr.c)
})
frip_dt$name = factor(frip_dt$name, levels = levels(bam_config_dt$name))
plot_frip_dt(frip_dt, name_lev = levels(bam_config_dt$name))
anno_grs = make_anno_grs("/slipstream//home/joeboyd/gencode.v35.annotation.gtf.gz")
anno_dt = make_anno_dt(peak_grs, anno_grs)
plot_anno_overlap(anno_dt)
prof_dt.bam = ssvFetchBam(bam_config_dt, asses_gr.c, return_data.table = TRUE, n_region_splits = 20, fragLens = 120)
clust_dt.bam = ssvSignalClustering(prof_dt.bam, nclust = 6, max_cols = Inf)
assign_dt.bam = make_assign_dt(clust_dt.bam)

signal_plots.bam = plot_signals(prof_dt.bam, asses_gr.c, assign_dt = assign_dt.bam, frip_dt = frip_dt, scc_dt = scc_dt, anno_grs = anno_grs)
signal_plots.bam[-1]
prof_dt.bw = ssvFetchBigwig(bw_config_dt, asses_gr.c, return_data.table = TRUE, n_region_splits = 20)
# clust_dt.bw = ssvSignalClustering(prof_dt.bw, nclust = 6, max_cols = Inf)
# assign_dt.bw = make_assign_dt(clust_dt.bw)
signal_plots.bw = plot_signals(prof_dt.bw, assign_dt = assign_dt.bam)
signal_plots.bw[2]


FrietzeLabUVM/ssvQC documentation built on March 25, 2024, 12:24 a.m.