knitr::opts_chunk$set( collapse = TRUE, comment = "#>", echo = TRUE, include = TRUE, eval = FALSE )
This illustrates how to construct the data object for the Sample Agreement QC step -- RLE -- on each of the data sets. In each case, once you have added the design (the last step for each set), you need to write the object to file and send it to the cluster. I suggest mounting to the cluster and writing to the mounted directory. See the vignette Running DESeq on HTCF for instructions on running DESeq on HTCF. Note that at the bottom of this script, there are instructions on how to copy the necessary scripts to run the DESeq model using MPI on htcf.
You can see all defined sets by entering ?createExperimentSet
. If an experiment
set you are interested in does not exist as an option in set_names
, then you'll
need to use the extractColData(blrs)
of the blrs
object below to play around
and come up with a set of filters to define your new set. See the github repository,
"R/ExperimentSetFunctions.R" for examples
library(brentlabRnaSeqTools) library(rtracklayer) library(tidyverse) library(caret) # set variables KN99_GFF_RDS = Sys.getenv("kn99_stranded_gff_rds") DB_USERNAME = Sys.getenv("db_username") DB_PASSWORD = Sys.getenv("db_password") # note: I mount to the cluster and output directly to it DDS_OUTPUT_DIR = Sys.getenv("OUTPUT_DIR") TODAY = format(lubridate::today(),"%Y%m%d") # controls whether dds objects are written WRITE_OUT = TRUE
blrs = brentlabRnaSeqSetFromDatabase('kn99',DB_USERNAME, DB_PASSWORD)
this adds all of the data regarding each locus as a GRange object to the gene data slot of the brentlabRnaSeqSet object. Useful if you are going to use other Bioconductor packages, or brentlabRnaSeqTools::createIgvBrowserShot()
kn99_gff = readRDS(KN99_GFF_RDS) kn99_genes = kn99_gff[kn99_gff$ID %in% rownames(blrs)] rowRanges(blrs) = kn99_genes[order(match(kn99_genes$ID,rownames(blrs)))]
This is the data set defined by the single locus KOs in 90minuteInduction
conditions for genotypes in the grant_df
(this is a data object which is
loaded as part of brentlabRnaSeqTools
-- take a look if you are interested)
blrs_90min_grant = createExperimentSet(blrs, 'ninetyMin_2016Grant') # note that this filters out those samples which failed QC1, # but but does not filter on RLE unless the argument rle_iqr_threshold # is set to a numeric value blrs_90min_grant_fltr = qaFilter(blrs_90min_grant) # remove WT which fall on dates with no perturbed samples blrs_90min_grant_fltr = filterWtByExperimentalLibdate_90min(blrs_90min_grant_fltr)
blrs_90min_grant_fltr = estimateSizeFactorsByProtocol(blrs_90min_grant_fltr) min_libdate = min(as.Date(colData(blrs_90min_grant_fltr)$libraryDate)) colData(blrs_90min_grant_fltr)$libraryDate = colData(blrs_90min_grant_fltr)$libraryDate %>% relevel(ref = as.character(min_libdate)) %>% droplevels() colData(blrs_90min_grant_fltr)$libraryProtocol = colData(blrs_90min_grant_fltr)$libraryProtocol %>% factor() %>% relevel(ref = "SolexaPrep") %>% droplevels() colData(blrs_90min_grant_fltr)$genotype1 = colData(blrs_90min_grant_fltr)$genotype1 %>% relevel(ref = "CKF44_00000") %>% droplevels() mm = model.matrix(~libraryProtocol+libraryDate+genotype1, extractColData(blrs_90min_grant_fltr)) min_new_protocol = min(as.Date((unique(pull(filter( extractColData(blrs_90min_grant_fltr), libraryProtocol == 'E7420L'), libraryDate))))) mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]
lin_dep_report = caret::findLinearCombos(mm)
design(blrs_90min_grant_fltr) = mm rm(mm)
if(WRITE_OUT){ outpath = file.path(DDS_OUTPUT_DIR, "90minDataFreeze", TODAY, "grant_only_input.rds") dds = DESeq2::DESeqDataSetFromMatrix( colData = extractColData(blrs_90min_grant_fltr), countData = counts(blrs_90min_grant_fltr), design = design(blrs_90min_grant_fltr)) sizeFactors(dds) = sizeFactors(blrs_90min_grant_fltr) write_rds(dds, outpath) }
This has all double KO samples in which either of the KO loci are in the grant_df
,
plus single KO samples corresponding to one of the double KO perturbed loci and
WT.
blrs_grant_with_doubles = createExperimentSet(blrs, 'ninetyMin_2016GrantWithDoubles') # note that this filters out those samples which failed QC1, # but but does not filter on RLE unless the argument rle_iqr_threshold # is set to a numeric value blrs_grant_with_doubles_fltr = qaFilter(blrs_grant_with_doubles) # remove WT which fall on dates with no perturbed samples blrs_grant_with_doubles_fltr = filterWtByExperimentalLibdate_90min(blrs_grant_with_doubles_fltr)
blrs_grant_with_doubles_fltr = estimateSizeFactorsByProtocol(blrs_grant_with_doubles_fltr) min_libdate = min(as.Date(colData(blrs_grant_with_doubles_fltr)$libraryDate)) colData(blrs_grant_with_doubles_fltr)$libraryDate = colData(blrs_grant_with_doubles_fltr)$libraryDate %>% relevel(ref = as.character(min_libdate)) %>% droplevels() # add a 'genotype' column which is a concatenation of genotype1 and genotype2 colData(blrs_grant_with_doubles_fltr)$genotype = paste(colData(blrs_grant_with_doubles_fltr)$genotype1, colData(blrs_grant_with_doubles_fltr)$genotype2, sep = "_") %>% str_remove('_$') %>% factor() %>% relevel(ref = "CKF44_00000") %>% droplevels() colData(blrs_grant_with_doubles_fltr)$libraryProtocol = colData(blrs_grant_with_doubles_fltr)$libraryProtocol %>% factor() %>% relevel(ref = "SolexaPrep") %>% droplevels() mm = model.matrix(~libraryProtocol+libraryDate+genotype, extractColData(blrs_grant_with_doubles_fltr)) min_new_protocol = min( as.Date( (unique( pull( filter(extractColData(blrs_grant_with_doubles_fltr), libraryProtocol == 'E7420L'), libraryDate))))) mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]
lin_dep_report = caret::findLinearCombos(mm) lin_dep_report
design(blrs_grant_with_doubles_fltr) = mm rm(mm)
if(WRITE_OUT){ outpath = file.path(DDS_OUTPUT_DIR, "90minDataFreeze", TODAY, "grant_doubles_input.rds") dds = DESeq2::DESeqDataSetFromMatrix( colData = extractColData(blrs_grant_with_doubles_fltr), countData = counts(blrs_grant_with_doubles_fltr), design = design(blrs_grant_with_doubles_fltr)) sizeFactors(dds) = sizeFactors(blrs_grant_with_doubles_fltr) write_rds(dds, outpath) }
This set is all samples -- singles and doubles -- which are in the 90minuteInduction conditions.
blrs_90min_all = createExperimentSet(blrs, 'ninetyMin_all') # note that this filters out those samples which failed QC1, # but but does not filter on RLE unless the argument rle_iqr_threshold # is set to a numeric value blrs_90min_all_fltr = qaFilter(blrs_90min_all) # remove WT which fall on dates with no perturbed samples blrs_90min_all_fltr = filterWtByExperimentalLibdate_90min(blrs_90min_all_fltr)
blrs_90min_all_fltr = estimateSizeFactorsByProtocol(blrs_90min_all_fltr) min_libdate = min(as.Date(colData(blrs_90min_all_fltr)$libraryDate)) colData(blrs_90min_all_fltr)$libraryDate = colData(blrs_90min_all_fltr)$libraryDate %>% relevel(ref = as.character(min_libdate)) %>% droplevels() # add a 'genotype' column which is a concatenation of genotype1 and genotype2 colData(blrs_90min_all_fltr)$genotype = paste(colData(blrs_90min_all_fltr)$genotype1, colData(blrs_90min_all_fltr)$genotype2, sep = "_") %>% str_remove('_$') %>% factor() %>% relevel(ref = "CKF44_00000") %>% droplevels() colData(blrs_90min_all_fltr)$libraryProtocol = colData(blrs_90min_all_fltr)$libraryProtocol %>% factor() %>% relevel(ref = "SolexaPrep") %>% droplevels() mm = model.matrix(~libraryProtocol+libraryDate+genotype, extractColData(blrs_90min_all_fltr)) min_new_protocol = min( as.Date( (unique( pull( filter(extractColData(blrs_90min_all_fltr), libraryProtocol == 'E7420L'), libraryDate))))) mm = mm[,-which(colnames(mm) == paste0('libraryDate',min_new_protocol))]
lin_dep_report = caret::findLinearCombos(mm) lin_dep_report
design(blrs_90min_all_fltr) = mm rm(mm)
if(WRITE_OUT){ outpath = file.path(DDS_OUTPUT_DIR, "90minDataFreeze", TODAY, "90min_all_input.rds") dds = DESeq2::DESeqDataSetFromMatrix( colData = extractColData(blrs_90min_all_fltr), countData = counts(blrs_90min_all_fltr), design = design(blrs_90min_all_fltr)) sizeFactors(dds) = sizeFactors(blrs_90min_all_fltr) write_rds(dds, outpath) }
ep_list = list( wt = createExperimentSet(blrs, 'envPert_epWT'), titration = createExperimentSet(blrs, 'envPert_titrationWT'), perturbed = createExperimentSet(blrs, 'envPert_perturbed') ) # NOTE!! AS OF 20220201 CKF44_03894 forms a linear depdendence with both # concat treatment and libraryDate columns. it is being removed here to solve # that issue ep_list$perturbed = ep_list$perturbed[,ep_list$perturbed$genotype1 != "CKF44_03894"]
ep_list_qc_passing = map(ep_list, qaFilter)
# How this is done is up to you, and obviously affects what genes are left in. # Below is an example. You need to think about the thresholds and filter method # that suits your purpose best. expr_fltr_list = map(ep_list, ~rowSums(edgeR::cpm(counts(.))>4) >= 4) ep_list_qc_passing_fltr = map2(ep_list_qc_passing, expr_fltr_list, ~.x[.y,])
setBaseConcatTreatmentBaseCond = function(ep_set, concat_base_cond){ colData(ep_set)$concat_treatment = relevel(colData(ep_set)$concat_treatment, ref = concat_base_cond) colData(ep_set) = droplevels(DataFrame(colData(ep_set))) ep_set } setEpDesign = function(ep_set, design){ design(ep_set) = design ep_set } concat_base_cond_list = list( wt = "YPD_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_30", titration = "RPMI_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_30", perturbed = 'PBS_noAtmosphere_30_noTreatment_noTreatmentConc_noPH_0' ) ep_designs = list( wt = formula(~libraryDate + concat_treatment), titration = formula(~libraryDate + concat_treatment), perturbed = formula(~libraryDate + concat_treatment + genotype1) ) ep_list_qc_passing_fltr = map2(ep_list_qc_passing_fltr, concat_base_cond_list, setBaseConcatTreatmentBaseCond) ep_list_qc_passing_fltr = map2(ep_list_qc_passing_fltr, ep_designs, setEpDesign)
ep_dds_list = map(ep_list_qc_passing_fltr, coerceToDds)
DDS_OUTPUT_DIR = "/mnt/scratch/rnaseq_pipeline/experiments/epTally" if(WRITE_OUT){ today = format(lubridate::today(),"%Y%m%d") output_dir = file.path(DDS_OUTPUT_DIR, today) dir.create(output_dir, recursive=TRUE) map(names(ep_dds_list), ~write_rds(ep_dds_list[[.]], file.path(output_dir, paste0("ep_",.,".rds")))) }
These scripts will run DESeq in parallel on HTCF. The only items you'l need to edit is the path to the lookup file (if you are running more than one model), and to the dds_input in deseq_mpi.sh. If you do not keep deseq_de.R in the same directory as deseq_mpi.sh, then you'll need to update the path to the deseq_de.R script, also.
if(WRITE_OUT){ htcf_deseq_scripts = c(system.file('bash', 'htcf_parallel_deseq.sh', package = "brentlabRnaSeqTools"), system.file('R_executable', 'deseq_de.R', package = "brentlabRnaSeqTools")) lapply(htcf_deseq_scripts, file.copy, to = DDS_OUTPUT_DIR) }
Note: this entirely depends on what set you want to look at -- in this example, I am doing the sample agreement and tallies for the EP sets
DESEQ_OUTPUT_LIST = list( ep_wt = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_wt_20220201_output.rds", ep_titration = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_titration_20220201_output.rds", ep_perturbed = "/mnt/scratch/rnaseq_pipeline/experiments/epTally/20220201/ep_perturbed_20220201_output.rds" )
# get deseq output object ep_dds_list = list( ep_wt = readRDS(DESEQ_OUTPUT_LIST$ep_wt), ep_titr = readRDS(DESEQ_OUTPUT_LIST$ep_titration), ep_pert = readRDS(DESEQ_OUTPUT_LIST$ep_perturbed) )
ep_dds_list$ep_pert$rep_col = paste(ep_dds_list$ep_pert$genotype1, ep_dds_list$ep_pert$concat_treatment, sep = "_")
ep_rle_list = list( ep_wt = removeLibdateByReplicate(ep_dds_list$ep_wt, "concat_treatment"), ep_titr = removeLibdateByReplicate(ep_dds_list$ep_titr, "concat_treatment"), ep_pert = removeLibdateByReplicate(ep_dds_list$ep_pert, "rep_col") )
if(UPDATE_DB){ updateRleTable = function(set_name, set_rle_summary){ iqr_col = paste0(set_name, "_iqr") med_dev_col = paste0(set_name, "_median_dev") update_df = set_rle_summary %>% dplyr::select(replicateAgreementNumber, rle_iqr, rle_median_deviation) %>% dplyr::rename(!!quo_name(iqr_col) := rle_iqr, !!quo_name(med_dev_col) := rle_median_deviation) res = patchTable( database_info$kn99$urls$replicateAgreement, Sys.getenv("kn99_db_token"), update_df, "replicateAgreementNumber") res } res_list = map(names(ep_rle_list), ~updateRleTable(., ep_rle_list[[.]]$without_libdate_effect$summary)) # check status ---- ## failures should only be those without enough replicates to calculate RLE extract_status = map(res_list, ~map(., ~as.numeric(.$status_code))) extract_status[[1]][unlist(extract_status[[1]]) != 200] extract_status[[2]][unlist(extract_status[[2]]) != 200] extract_status[[3]][unlist(extract_status[[3]]) != 200] }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.