#' @title Set up a directory tree for ConcensusGLM
#' @description Set up a directory tree for ConcensusGLM
#' @param path Character. Path to root of directory tree
#' @return list of class "directoryTree"
#' @export
directoryTree <- function ( path='.' ) {
top.dir <- file.path(path, 'concensusGLM-results')
csv.dir <- 'tables'
csv.path <- file.path(top.dir, csv.dir)
dir.create(csv.path, showWarnings=FALSE, recursive=TRUE)
return.list <- structure(list(
top=top.dir,
tables=csv.path
), class='directoryTree')
return ( return.list )
}
#' @title Set up analysis container for ConcensusGLM
#' @description Set up analysis container for ConcensusGLM. This is the workhorse function; most users should use
#' \link{concensusDataSetFromFile}.
#' @param checkpoint Logical. Save intermediate results as checkpoints?
#' @param working_directory Character. Path to where you want the analysis output.
#' @param load_checkpoint If \code{"latest"}, reload from latest checkpoint files in \code{working_directory}.
#' If a string pattern matching existing checkpoint, then loads those files. If \code{NULL} (the default) then doesn nothing.
#' @return list of class "concensusDataSet"
#' @seealso \link{workflows::newWorkflow}, \link{concensusDataSetFromFile}
#' @export
newConcensusDataSet <- function(checkpoint=FALSE, working_directory='.', load_checkpoint=NULL, ...) {
workflow <- workflows::newWorkflow(structure(list(...), class='concensusDataSet'),
checkpoint=checkpoint, working_directory=working_directory,
load_checkpoint=load_checkpoint)
class(workflow) <- c('concensusWorkflow', class(workflow))
#print(str(workflow))
return ( workflow )
}
#' @title Set up analysis container for ConcensusGLM
#' @description Set up analysis container for ConcensusGLM using path to data as input. Performs validity checks on the input.
#' Optionally annotates input with experimental metadata from \code{annotation_filename}.
#' @details This creates the concensusDataSet object from \code{data_filename} on which downstream analysis is carried out.
#'
#' The input CSV should be found at \code{data_filename}. This CSV should at least have the headers
#' \code{'id', 'compound', 'concentration', 'strain', 'plate_name', 'count', 'well'}, and must have one row per
#' strain-compound-concentration-plate_name-well combination. Together, "id", "plate_name" and "well" define unique experimental
#' samples; "id" refers to sequencing (technical) replicates (if any) and "plate_name" and "well" are biological replicates
#' (recommended). Any given condition should have at least 2 replicates of some kind. If "row" and "column"
#' are present but not "well", then well is constructed by concatenating "row" and "column".
#'
#' Firstly, this function loads a CSV from \code{data_filename}. This may take some time if it is a large file. It then checks
#' that the minimum headers are present.
#'
#' It then checks for either a \code{negative_control} column or a list of control compounds supplied to \code{controls} argument.
#'
#' Under-represented (assumed to be spurious) strains and plates (as defined by \code{threshold}) are removed.
#'
#' Then, pseudo-strains are built if requested (the default). So far, the only pseudo-strain defined is "total", which
#' is the total counts per well of non-spike-in strains.
#'
#' If defined, annotations (like experimental meta-data) are loaded from \code{annotation_filename}. This CSV file needs a
#' column name in common with \code{data_filename} (case sensitive) since the next step is a join on the shared columns.
#' Also, every observation in the input data that you want to keep must be annotated.
#'
#' Finally, it adds a \code{negative_control} column if necessary and a \code{positive_control} column if defined, and checks
#' that there are at least 2 negative control observations.
#' @param data_filename Character. Path to a table of counts.
#' @param annotation_filename Character. Optional. Path to experimental annotations. These will be joined to the input data
#' and used for batch correction.
#' @param output_path Character. Path to directory where you want the analysis output. This is also where
#' checkpoints and logs from cluster execution are stored. Default is current working directory.
#' @param controls Named list of Characters with elements \code{positive} and \code{negative}.
#' Adds logical columns to data called \code{positive_control} and \code{negative_control} based on regular expression
#' matches.
#' @param test Logical. Run in test mode? If so, only reads the first 5 million lines of \code{data_filename}.
#' @param checkpoint Logical. Save intermediate results as checkpoints?
#' @param threshold Numeric. Strains below this total count threshold will be discarded. Default \code{1000}.
#' Plates below 1000 x \code{threshold} will be discarded. Set to \code{0} to skip this.
#' @param spike_in Character. A regular expression to match spike-in controls.
#' @param pseudostrains Logical. Make pseudostrains such as "total", which is the sum of all non-spike-ins.
#' @return list of class "concensusDataSet"
#' @seealso \link{newWorkflow}, \link{pipeline}, \link{concensusDataSetFromFile}
#' @import workflows
#' @importFrom magrittr %>%
#' @export
concensusDataSetFromFile <- function(data_filename, annotation_filename=NULL, output_path='.',
controls=NULL, rename=NULL, test=FALSE, checkpoint=FALSE,
threshold=100, spike_in='^intcon',
pseudostrains=TRUE,
...) {
println('Loading dataset from', data_filename, '...')
if ( test ) data_ <- readr::read_csv(data_filename, progress=TRUE, n_max=5e6, col_types=readr::cols(concentration=readr::col_double()))
else data_ <- readr::read_csv(data_filename, progress=TRUE, col_types=readr::cols(concentration=readr::col_double()))
check_headers(data_, essential_headers=c('compound', 'concentration', 'strain', 'plate_name', 'count'))
if ( ! 'well' %in% names(data_) & all(c('row', 'column') %in% names(data_)) ) {
data_ <- data_ %>% dplyr::mutate(well=paste0(row, column))
} else if ( ! 'well' %in% names(data_) ) {
stop('Column "well" or both "row" and "column" must be present in data file.\n')
}
original_columns <- names(data_)
if( ! ('negative_control' %in% original_columns) & is.null(controls) ) {
stop('Negative control column absent or controls not supplied.\n')
}
data_ <- clean(data_, threshold=threshold)
if ( pseudostrains ) {
println('Building pseudostrains...')
pseudostrain_total <- data_ %>%
dplyr::filter(!grepl(spike_in, strain)) %>%
dplyr::group_by(id, plate_name, well) %>%
dplyr::summarize(count=sum(count)) %>%
dplyr::ungroup() %>%
dplyr::mutate(strain='pseudostrain_total') %>%
dplyr::left_join(data_ %>% dplyr::ungroup() %>% dplyr::select(-count, -strain) %>% dplyr::distinct()) %>%
dplyr::ungroup() %>%
dplyr::distinct()
stopifnot(length(setdiff(names(pseudostrain_total), names(data_))) == 0)
stopifnot(nrow(pseudostrain_total) == nrow(data_ %>% dplyr::filter(strain == get_unique_values(data_, 'strain')[1])))
data_ <- dplyr::bind_rows(data_, pseudostrain_total)
}
if ( ! is.null(annotation_filename) ) {
println('Loading annotations from', annotation_filename, '...')
annotations <- readr::read_csv(annotation_filename)
}
annotation_columns <- names(annotations)
if ( ! is.null(annotation_filename) ) {
intersection <- intersect(names(data_), annotation_columns)
if ( length(intersection) == 0 ) stop('No columns in common between data and annotations.\n')
data_ <- data_ %>% dplyr::inner_join(annotations, by=intersection)
}
if ( ! is.null(controls) ) {
if ( ! 'negative' %in% names(controls) ) stop('List of controls must include an element named "negative".\n')
println('Assigning negative controls based on compound matching "', controls$negative, '"')
data_ <- data_ %>%
dplyr::mutate(negative_control=grepl(controls$negative, compound))
if ( 'positive' %in% names(controls) ) {
if ( ! is.null(controls$positive) ) {
println('Assigning positive controls based on compound matching "', controls$positive, '"')
data_ <- data_ %>%
dplyr::mutate(positive_control=grepl(controls$positive, compound))
}
}
# check at least some negative controls are present
n_negative_controls <- sum(getElement(data_, 'negative_control'))
n_positive_controls <- sum(getElement(data_, 'positive_control'))
println('There are', n_negative_controls, 'negative controls')
println('There are', n_positive_controls, 'positive controls')
if ( n_negative_controls < 2 ) {
stop('Not enough negative controls. Only ', n_negative_controls, 'present; you need at least 2.\n')
}
}
output_prefix <- file.path(output_path, strsplit(basename(data_filename), '.csv', fixed=TRUE)[[1]][1])
concensus_dataset <- newConcensusDataSet(data_filename=data_filename,
annotation_filename=annotation_filename,
date_created=date(),
data=data_,
data_columns=original_columns,
annotation_columns=annotation_columns,
controls=controls,
n_datapoints=nrow(data_),
n_compounds=length(get_unique_values(data_, 'compound')),
n_strains=length(get_unique_values(data_, 'strain')),
n_plates=length(get_unique_values(data_, 'plate_name')),
n_lanes=length(get_unique_values(data_, 'id')),
n_reads=sum(as.numeric(column2vector(data_, 'count'))),
output_prefix=output_prefix,
checkpoint=checkpoint,
working_directory=output_path)
return ( concensus_dataset )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.