knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, results='hide')

seqqc overview

This is still pretty rough, but what it does is generate qc figures as long as you can generate data.tables that define file paths with "name" attributes and "treatment" attributes for those files.

Currently, definitions for fastq, bam, bigwig, and peak files are required but this will eventually be more flexible.

options(mc.cores = 10)
library(seqqc)
library(seqsetvis)
library(BiocFileCache)
bfc = BiocFileCache()
bfcif = ssvRecipes::bfcif
wd = c("/slipstream/galaxy/uploads/working/qc_framework/output",
       "/slipstream/galaxy/uploads/working/qc_framework/output_CG_BRCA_progression")
fastq_dir = dir(wd, full.names = TRUE)
fastq_files = dir(fastq_dir, 
                  pattern = "fastq.gz$", 
                  full.names = TRUE)
peak_dir = dir(file.path(wd), full.names = "TRUE")
peak_files = dir(peak_dir, 
                 pattern = "passIDR.05.+Peak$", 
                 full.names = TRUE)
bam_dir = peak_dir
bam_files = dir(bam_dir, pattern = "bam$", 
                full.names = TRUE)
bw_dir = peak_dir
bw_files = dir(bw_dir, pattern = "ed_FE.bw$", 
               full.names = TRUE)
fix_names = function(in_f){
  f = basename(in_f)
  # if(grepl("fastq", f)){
  #   # f = sapply(strsplit(f, "_"), function(x){
  #   #   paste0(x[1], "_", x[2], "_rep", x[3])
  #   # })
  # }else{
  f = sub("\\..+", "", f)  
  # }
  f = sub("_peaks_passIDR", "", f)
  f = sub("_rep", "_R", f)
  f = sub("_R", "_rep", f)
  # f = sub("Frietze_Cohrs_11497_180409_SNL128_0174_ACC3JAACXX_", "", f)
  # f = sub("Control_HFL_AntiCTCF_S31_L005.+", "HFL_con_CTCF_set1", f)
  # f = sub("Control_HFL_Input_S30_L005.+", "HFL_con_in_set1", f)
  # f = sub("VZV_AntiCTCF_S33_L005.+", "HFL_VZV_CTCF_set1", f)
  # f = sub("VZV_Input_S32_L005.+", "HFL_VZV_in_set1", f)
  # #look behind (?<=set[0-9])_.+
  # f = sub("(?<=set[0-9])_.+", "", f, perl = TRUE)

  toupper(f)
}

names(bw_files) = fix_names(bw_files)
names(bam_files) = fix_names(bam_files)
names(peak_files) = fix_names(peak_files)
names(fastq_files) = fix_names(fastq_files)
filter_files = function(f){
  f = f[grepl("K4ME3", names(f))]
  f = f[!grepl("not_used", f)]
  f
}

bw_files.f = filter_files(bw_files)
bam_files.f = filter_files(bam_files)
bam_files.f = bam_files.f[grepl("REP", names(bam_files.f))]
peak_files.f = filter_files(peak_files)
fastq_files.f = filter_files(fastq_files)
#using rep as treatment here becuase that looks more interesting
my_make_config_dt = function(files){
  config_dt = data.table(file = files, name = names(files))
  config_dt[, c("cell", "mark", "rep") := tstrsplit(name, "_", keep = 1:3)]
  config_dt[, treatment := cell]
  config_dt = config_dt[order(rep)][order(treatment)][order(cell)]
  config_dt$name = factor(config_dt$name, levels = config_dt$name)
  config_dt
}
fastq_config_dt = my_make_config_dt(fastq_files.f)
peak_config_dt = my_make_config_dt(peak_files.f)
bam_config_dt = my_make_config_dt(bam_files.f)
bw_config_dt = my_make_config_dt(bw_files.f)

stopifnot(file.exists(fastq_config_dt$file))
stopifnot(file.exists(peak_config_dt$file))
stopifnot(file.exists(bam_config_dt$file))
stopifnot(file.exists(bw_config_dt$file))

Nothing beyond here should need to be edited.

fq_dt = bfcif(bfc, rname = paste("fastq_dt", digest::digest(list(fastq_config_dt))), function(){
  make_fq_dt(fastq_config_dt, cache_counts = F)
})

plot_fq_dt(fq_dt)
peak_grs = easyLoad_narrowPeak(peak_config_dt$file, file_names = peak_config_dt$name)
olaps_gr = ssvOverlapIntervalSets(peak_grs)
cons2_gr = ssvConsensusIntervalSets(peak_grs, min_number = 2, min_fraction = 0)
set.seed(0)
asses_gr = resize(sample(cons2_gr, 5e3), 600, fix = "center")
asses_gr = prepare_fetch_GRanges_names(asses_gr)
feature_plots = plot_feature_comparison(peak_grs)
cowplot::plot_grid(plotlist = feature_plots$all)
cowplot::plot_grid(plotlist = feature_plots$consensus)

For paired end data, you should add flag = Rsamtools::scanBamFlag(isFirstMateRead = TRUE) to the make_scc_dt call. This flag will cause results to be empty for SE data though.

scc_dt = make_scc_dt(bam_config_dt, query_gr = asses_gr)
fl = round(mean(scc_dt$fragment_length[grepl("CTCF", name)]$fragment_length))

scc_plots = plot_scc_dt(scc_dt)

scc_plots$scc_curves + facet_wrap("~name", ncol = 3)
scc_plots$scc_dots + facet_wrap("~name", ncol = 3)
view_size = 3e3
asses_gr.c = bfcif(bfc, rname = paste("center_qgr", digest::digest(list(bw_config_dt, asses_gr, view_size, fl))), function(){
  make_centered_query_gr(bw_config_dt, asses_gr, view_size = view_size)
})
frip_dt = bfcif(bfc, rname = paste("frip_dt", digest::digest(list(bam_config_dt, asses_gr.c))), function(){
  make_frip_dt(bam_config_dt, asses_gr.c)
})
frip_dt$name = factor(frip_dt$name, levels = levels(bam_config_dt$name))
plot_frip_dt(frip_dt, name_lev = levels(bam_config_dt$name))
anno_grs = make_anno_grs("/slipstream//home/joeboyd/gencode.v35.annotation.gtf.gz")
anno_dt = make_anno_dt(peak_grs, anno_grs)
plot_anno_overlap(anno_dt)
prof_dt.bam = ssvFetchBam(bam_config_dt, asses_gr.c, return_data.table = TRUE, n_region_splits = 20, fragLens = 120)
prof_dt.bam[, y_relative := y / max(y), list(id)]
clust_dt.bam = ssvSignalClustering(prof_dt.bam, nclust = 6, max_cols = Inf, fill_ = "y_relative")
assign_dt.bam = make_assign_dt(clust_dt.bam)

signal_plots.bam = plot_signals(prof_dt.bam, asses_gr.c, assign_dt = assign_dt.bam, frip_dt = frip_dt, scc_dt = scc_dt, anno_grs = anno_grs)
signal_plots.bam[-1]
prof_dt.bw = ssvFetchBigwig(bw_config_dt, asses_gr.c, return_data.table = TRUE, n_region_splits = 20)
# clust_dt.bw = ssvSignalClustering(prof_dt.bw, nclust = 6, max_cols = Inf)
# assign_dt.bw = make_assign_dt(clust_dt.bw)
signal_plots.bw = plot_signals(prof_dt.bw, assign_dt = assign_dt.bam)
signal_plots.bw[2]


FrietzeLabUVM/ssvQC documentation built on March 25, 2024, 12:24 a.m.