knitr::opts_chunk$set( collapse = TRUE, comment = "#>", echo = TRUE, include = TRUE, eval = FALSE )
The BrentlabRnaSeqSet
is a child of the DESeqDataSet, which is a child of
SummarizedExperiment.
Therefore, the brentlabRnaSeqSet
has all of the DESeq functionality -- eg,
DESeq size factor normalization, or the DESeq()
function -- as well as all of
the SummarizedExperiment methods, and some more 'custom' methods more suited to
our purposes. It is easily extensible -- for those more computationally minded
users, it would be a good idea to learn and use the object oriented programming
tools. There are many cases in which doing so will make your code less error
prone, easier to test, more reproducible, and easier to maintain long term.
library(brentlabRnaSeqTools) library(rtracklayer) library(tidyverse) # set variables KN99_GFF_PATH = Sys.getenv("kn99_stranded_gff") DB_USERNAME = Sys.getenv("db_username") DB_PASSWORD = Sys.getenv("db_password") # note: I mount to the cluster and output directly to it DDS_OUTPUT_DIR = "/mnt/htcf_scratch/chasem/rnaseq_pipeline/experiments/90minDataFreeze" # controls whether dds objects are written WRITE_OUT = FALSE # Pull the database ---- blrs = brentlabRnaSeqSetFromDatabase('kn99',DB_USERNAME, DB_PASSWORD) # filter down to just the protein coding genes. Note that this works because the # nctr-rna annotations were all added to the end of the annotation file. blrs = blrs[1:6967,] # Add gene level data (optional) ---- # this adds all of the data regarding each locus as a GRange object to the # gene data slot of the brentlabRnaSeqSet object. Useful if you are going to # use other Bioconductor packages. kn99_gff = rtracklayer::import(KN99_GFF_PATH) kn99_genes = kn99_gff[kn99_gff$ID %in% rownames(blrs)] rowRanges(blrs) = kn99_genes[order(match(kn99_genes$ID,rownames(blrs)))] rownames(blrs) = rowData(blrs)$ID
If the experiment set you want is not already created, issue an 'issue report', and describe the EXACT filter that you want to use to create the set. What goes into your set depends on what you describe as your filter, so spend time with the database to figure out what is there.
blrs_90min = createExperimentSet(blrs, 'ninetyMin_2016Grant') # Quality Filter ---- blrs_90min_qc_passing = qaFilter(blrs_90min, rle_iqr_threshold = .6125, iqr_colname = "ninetyMin_iqr")
blrs_90min_qc_passing = setPerturbedLociToZero(blrs_90min_qc_passing)
Filter out low expression genes (and anything else you don't want in the
analysis set). How you do this is up to you, the analyst. You might use the method
proposed in this paper by,
among others, Wolfgang Huber, and implemented in the
bioconductor package
geneFilter
(as well as DESeq, by default), or you
could filter based on some number of samples having greater than some threshold
of expression, which is shown below.
# How this is done is up to you, and obviously affects what genes are left in. # Below is an example. You need to think about the thresholds and filter method # that suits your purpose best. # mid_expression_filter <- rowSums(edgeR::cpm(counts(blrs_90min_qc_passing))>4) >= 4 high_disp_fltr = rownames(blrs_90min_qc_passing) %in% passing_genes_all$gene_id blrs_90min_qc_passing = blrs_90min_qc_passing[high_disp_fltr,]
# note, qaFilter returns those samples with less than 3 replicates, which have NA # in the RLE stats. Filter those out, also blrs_90min_qc_passing = blrs_90min_qc_passing[,!is.na(colData(blrs_90min_qc_passing)$ninetyMin_iqr)] protocol_tallies = replicateByProtocolTally(blrs_90min_qc_passing) # protocol_tallies$replicates_with_less_than_four_in_both_old_or_new. All of the # genotypes should be in the list below protocol_fltr = as_tibble(colData(blrs_90min_qc_passing)) %>% filter(!(genotype1 == "CKF44_00031" & libraryProtocol == "SolexaPrep"), !(genotype1 == "CKF44_00871" & libraryProtocol == "E7420L"), !(genotype1 == "CKF44_00883" & libraryProtocol == "E7420L"), !(genotype1 == "CKF44_01626" & libraryProtocol == "E7420L"), !(genotype1 == "CKF44_02774" & libraryProtocol == "E7420L"), !(genotype1 == "CKF44_03018" & libraryProtocol == "SolexaPrep"), !(genotype1 == "CKF44_03279" & libraryProtocol == "SolexaPrep"), !(genotype1 == "CKF44_03849" & libraryProtocol == "E7420L"), !(genotype1 == "CKF44_04353" & libraryProtocol == "E7420L"), !(genotype1 == "CKF44_05222" & libraryProtocol == "SolexaPrep")) %>% pull(fastqFileNumber) blrs_90min_qc_passing_protocol_fltr = blrs_90min_qc_passing[,colData(blrs_90min_qc_passing)$fastqFileNumber %in% protocol_fltr] colData(blrs_90min_qc_passing_protocol_fltr)$libraryDate = droplevels(colData(blrs_90min_qc_passing_protocol_fltr)$libraryDate)
# note that the column names may not reflect what is actually pasesd -- this # function needs some updating due to hard coding colnames. Columns are in order of the # tables passed in protocol_fltr_tally = createInductionSetTally(as_tibble(colData(blrs_90min)), as_tibble(colData(blrs_90min_qc_passing)), as_tibble(colData(blrs_90min_qc_passing_protocol_fltr)), grant_df)
blrs_90min_qc_passing_protocol_fltr_split = splitProtocolGroups(blrs_90min_qc_passing_protocol_fltr) # libraryprotocol date date have their levels dropped. genotype1 still needs it # this needs to be handled internally somehow blrs_90min_qc_passing_protocol_fltr_split$SolexaPrep$genotype1 = droplevels(blrs_90min_qc_passing_protocol_fltr_split$SolexaPrep$genotype1) blrs_90min_qc_passing_protocol_fltr_split$E7420L$genotype1 = droplevels(blrs_90min_qc_passing_protocol_fltr_split$E7420L$genotype1)
# filter out those samples with less than 3 replicates in either libraryDate or # genotype1 # helper function to create model matricies by protocol createFullSetModelMatricies = function(full_set_split){ full_set_mm_list = list( E7420L = model.matrix(~libraryDate+genotype1, as_tibble(colData(full_set_split$E7420L))), SolexaPrep = model.matrix(~libraryDate+genotype1, as_tibble(colData(full_set_split$SolexaPrep)))) full_set_mm_list } # return those replicate sets which have less than 2 replicates lowReplicateParams = function(model_matrix){ mm_summary_df = tibble(model_params = colnames(model_matrix), replicate_tally=colSums(model_matrix)) low_rep_parameters = mm_summary_df %>% filter(replicate_tally < 2) %>% pull(model_params) low_rep_parameters = str_remove(low_rep_parameters, "libraryDate") low_rep_parameters = str_remove(low_rep_parameters, "genotype1") return(low_rep_parameters) } # create model matricies from the blrs_90min_qc_passing_protocol_fltr_split full_set_mm_list = createFullSetModelMatricies(blrs_90min_qc_passing_protocol_fltr_split) # find low rep parameters low_rep_parameters = lapply(full_set_mm_list ,lowReplicateParams) # all dates, no genotypes low_rep_parameters = lapply(low_rep_parameters, as.factor) # create filters for each protocol set e7420l_fltr = !colData(blrs_90min_qc_passing_protocol_fltr_split$E7420L)$libraryDate %in% low_rep_parameters$E7420L solexaprep_fltr = !colData(blrs_90min_qc_passing_protocol_fltr_split$SolexaPrep)$libraryDate %in% low_rep_parameters$SolexaPrep # create the full set list fltr_full_set_90min = list( E7420L = blrs_90min_qc_passing_protocol_fltr_split$E7420L[, e7420l_fltr], SolexaPrep = blrs_90min_qc_passing_protocol_fltr_split$SolexaPrep[, solexaprep_fltr] ) # drop the factor levels which no longer exist from the filtered libraryDate column colData(fltr_full_set_90min$E7420L)$libraryDate = droplevels(colData(fltr_full_set_90min$E7420L)$libraryDate) colData(fltr_full_set_90min$SolexaPrep)$libraryDate = droplevels(colData(fltr_full_set_90min$SolexaPrep)$libraryDate) # no more low rep sets left fltr_full_set_mm = createFullSetModelMatricies(fltr_full_set_90min) low_rep_parameters_2 = lapply(fltr_full_set_mm, lowReplicateParams)
protocol_fltr_tally = createInductionSetTally(as_tibble(colData(blrs_90min)), as_tibble(colData(blrs_90min_qc_passing)), rbind(as_tibble(colData(fltr_full_set_90min$E7420L)), as_tibble(colData(fltr_full_set_90min$SolexaPrep))), grant_df)
min_set_size = 1 hold_out_set = list( SolexaPrep = test_train_partition(fltr_full_set_90min$SolexaPrep, min_set_size), E7420L = test_train_partition(fltr_full_set_90min$E7420L, min_set_size) )
setNinetyMinDesign = function(obj){ obj$genotype1 = as.factor(obj$genotype1) relevel(obj$genotype1, ref = "CKF44_00000") obj$libraryDate = as.factor(obj$libraryDate) relevel(obj$libraryDate, ref = min(as.character(obj$libraryDate))) design(obj) = formula(~libraryDate + genotype1) obj } fltr_full_set_90min = map(fltr_full_set_90min, setNinetyMinDesign) hold_out_set$SolexaPrep$train = setNinetyMinDesign(hold_out_set$SolexaPrep$train) hold_out_set$E7420L$train = setNinetyMinDesign(hold_out_set$E7420L$train)
hold_out_set$SolexaPrep = map(hold_out_set$SolexaPrep, coerceToDds) hold_out_set$E7420L = map(hold_out_set$E7420L, coerceToDds) fltr_full_set_90min = map(fltr_full_set_90min, coerceToDds)
refactorDesign = function(dds){ dds$libraryProtocol = dds$libraryProtocol %>% as.character() %>% as.factor() %>% droplevels() dds$libraryDate = dds$libraryDate %>% as.Date() %>% as.factor() %>% droplevels() dds$genotype1 = dds$genotype1 %>% as.character() %>% as.factor() %>% droplevels() mm = model.matrix(~libraryProtocol + libraryDate + genotype1, as_tibble(colData(dds))) min_date = colData(dds) %>% as_tibble() %>% filter(libraryProtocol == "E7420L") %>% pull(libraryDate) %>% as.Date() %>% min() mm_redux = mm[,-which(colnames(mm) == paste0("libraryDate", min_date)), drop=FALSE] design(dds) = mm_redux dds } full_set_90min_both_protocols = cbind(fltr_full_set_90min$SolexaPrep, fltr_full_set_90min$E7420L) sizeFactors(full_set_90min_both_protocols) = c(estimateSizeFactors(fltr_full_set_90min$SolexaPrep)$sizeFactor, estimateSizeFactors(fltr_full_set_90min$E7420L)$sizeFactor) full_set_90min_both_protocols = refactorDesign(full_set_90min_both_protocols) train_set_90min_both_protocols = cbind(hold_out_set$SolexaPrep$train, hold_out_set$E7420L$train) sizeFactors(train_set_90min_both_protocols) = c(estimateSizeFactors(hold_out_set$SolexaPrep$train)$sizeFactor, estimateSizeFactors(hold_out_set$E7420L$train)$sizeFactor) train_set_90min_both_protocols = refactorDesign(train_set_90min_both_protocols) test_data = list( SolexaPrep = hold_out_set$SolexaPrep$test, E7420L = hold_out_set$E7420L$test )
if(WRITE_OUT){ today = format(lubridate::today(),"%Y%m%d") output_dir = file.path(DDS_OUTPUT_DIR, today) dir.create(output_dir, recursive=TRUE) # note: this is probably better done with a function and map() as the list gets # longer write_rds(fltr_full_set_90min$E7420L, file.path(output_dir,"full_set_new_protocol_input.dds")) write_rds(fltr_full_set_90min$SolexaPrep, file.path(output_dir, "full_set_old_protocol_input.dds")) write_rds(test_data, file.path(output_dir, "test_data.rds")) write_rds(hold_out_set$E7420L$train, file.path(output_dir, "train_set_new_protocol_input.rds")) write_rds(hold_out_set$SolexaPrep$train, file.path(output_dir, "train_set_old_protocol_input.rds")) write_rds(full_set_90min_both_protocols, file.path(output_dir, "full_set_both_protocol_input.rds")) write_rds(train_set_90min_both_protocols, file.path(output_dir, "train_set_both_protocol_input.rds")) }
These scripts will run DESeq in parallel on HTCF. The only items you'l need to edit is the path to the lookup file (if you are running more than one model), and to the dds_input in deseq_mpi.sh. If you do not keep deseq_de.R in the same directory as deseq_mpi.sh, then you'll need to update the path to the deseq_de.R script, also.
if(WRITE_OUT){ htcf_deseq_scripts = c(system.file('bash', 'htcf_parallel_deseq.sh', package = "brentlabRnaSeqTools"), system.file('R_executable', 'deseq_de.R', package = "brentlabRnaSeqTools")) lapply(htcf_deseq_scripts, file.copy, to = DDS_OUTPUT_DIR) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.