#' Preprocess PSI or cRPKM data frame using configuration file
#'
#' \code{preprocess_sample_colors} re-orders PSI or cRPKM sample columns to a
#' specified order, and it defines sample pools and colors. Order and colors are
#' taken from a pre-defined tab-delimited file (see \code{details}).
#'
#' @details
#' \code{preprocess_sample_colors} depends on a pre-defined "sample inventory"
#' database file in tab-delimited format. This file is species-specific and
#' consists of five columns: (optional) \emph{Order}, \emph{SampleName}, \emph{SubroupName}
#' (optional), \emph{GroupName}, (optional) \emph{RColorCode}. The header is required in
#' the file. Order of the columns is flexible.
#'
#' For example:
#'
#' \preformatted{Order SampleName SubgroupName GroupName RColorCode
#' 1 Oocyte_a Oocyte EarlyDev firebrick4
#' 2 Oocyte_b Oocyte EarlyDev firebrick4
#' 3 Embr_4C_a Embr_4C EarlyDev firebrick4
#' 4 Embr_4C_b Embr_4C EarlyDev firebrick4
#' 5 ESC_CGR8 ESC ESC firebrick
#' etc..}
#'
#' where:
#' \itemize{
#' \item{Order: A specific ordering of the samples from left to right of the plot.}
#' \item{SampleName: Name of the sample. MUST match sample name in input table.}
#' \item{SubgroupName: Use to define sample pools that will be plotted in the
#' same data point (see \code{plot_event}, \code{plot_expr} and
#' \code{plot_multievent}).}
#' \item{GroupName: Use for plotting the average PSI of samples belonging
#' to the same group. Averages will be calculated from the individual samples,
#' not from the subgroups (to avoid overrepresentation of subgroups with fewer
#' samples).}
#' \item {RColorCode: Color name as specified by \code{\link{colors}} or hex
#' color code (\code{#RRGGBB}).}
#' }
#'
#' The \emph{SampleName} must match the column names in \code{data}. It is possible
#' for \code{config} to contain more samples than the \code{data}. In this case,
#' the extra samples will be ignored. It is also possible that
#' \code{config} contains only a subset of the samples in \code{data}. In this
#' case, only the samples specified in the \code{config} will be plotted and everything
#' else is ignored.
#'
#' If a \emph{SampleName} or \emph{SubgroupName} is matched to multiple groups,
#' only the first match will be used. Similarly, if a \emph{GroupName} is matched
#' to multiple \emph{RColorCodes}, the first one will be applied to all elements
#' in the group.
#'
#' To use the \emph{SubgroupNames} in \code{config}, \code{subg} must be set to
#' \code{TRUE} AND \code{config} must contain a \emph{SubgroupName} column. If any
#' of these conditions is not met, one subgroup will be created for each sample,
#' preserving their name and order, and overriding any subgroups in \code{config}.
#'
#' The colors in \code{config} can be overridden by specifiying \code{col}. This
#' was mainly added to support the \code{col} option provided by
#' \code{\link{plot_event}} -- particularly when \code{config} is not provided.
#'
#' This function is also used for formatting cRPKM input data by setting
#' \code{expr = TRUE}.
#'
#' @param data A \emph{n} x \emph{2*m+1} data frame of PSI and quality score values
#' where \emph{n} is the number of AS events and \emph{m} is the number of samples.
#' If \code{expr=TRUE}, a \emph{n} x \emph{m+1} data frame of cRPKM. In both cases,
#' the first column corresponds to the row metadata. Metadata column values must
#' be unique, duplicated values will be discarded.
#' @param config Filename of the configuration file for \code{data}. Also
#' accepts \emph{m} x \emph{4} data frame of the configuration file, or an \emph{m}
#' x \emph{5} data frame if the SubgroupName column is included.
#' @param subg Set to \code{TRUE} to define a subgroup structure using the
#' \code{SubgroupName} column in \code{config}. If \code{FALSE}, or if the file
#' does not contain this column, samples will not be subgrouped (a separate
#' subgroup will be defined for each sample, preserving the sample names). If
#' \code{FALSE} and the file contains a \code{SubgroupName} column, that column
#' will be ignored.
#' @param expr Set to \code{TRUE} if formatting a cRPKM table. Otherwise, \code{FALSE}.
#' @param col Vector of colors with length matching the number of samples. If
#' specified, this will override the color settings specified in \code{config}.
#' @param multi_col Vector of colors with length matching the number of rows in
#' \code{data}. If specified, this can be used to define the color corresponding
#' to each event in \code{plot_multievent()}
#' @return
#' A list containing:
#' \describe{
#' \item{data}{data frame of PSI/cRPKM values with columns re-ordered}
#' \item{qual}{data frame of quality scores with columns re-ordered. \code{NULL}
#' if \code{expr = TRUE}}
#' \item{sample_order}{data frame with the order corresponding to each sample
#' name}
#' \item{subgroup}{data frame with the subgroup corresponding to each sample. If
#' \code{subg=FALSE}, or \emph{SubgroupName} is not present in \code{config},
#' a subgroup is made for each sample, preserving sample names.}
#' \item{subgroup_order}{data frame with the order corresponding to each subgroup.}
#' \item{group}{data frame with the group corresponding to each subgroup}
#' \item{group_order}{data frame with the order and color corresponding to
#' each group.}
#' \item{multi_col}{if \code{multi_col} was specified, a data frame with the
#' color coresponding to each event/gene ID. Else, \code{NULL}.}
#' \item{config}{if a \code{config} was supplied, config data frame summarising
#' the order-sample-subgroup-group-color relationships described in
#' \code{sample_order}, \code{subgroup}, \code{subgroup_order}, \code{group}
#' and \code{group_order}, after correcting for ambiguous relationships. If
#' \code{col} was supplied, colours in config are overridden with \code{col}. If
#' no \code{config} was supplied, one will be composed with default parameters.}
#' \item{original_config}{data frame with the \code{config} supplied to the function}
#' }
#' @seealso \code{\link{plot_event}}, \code{\link{plot_expr}},
#' \code{\link{plot_multievent}}
#' @export
#' @import dplyr
#' @import readr
#' @examples
#' #Tables from vast-tools need formatting before using this function
#' a <- format_table(psi)
#' reorderedpsi <- preprocess_sample_colors(a, config = config)
#'
#' b <- format_table(crpkm,expr=TRUE)
#' reorderedcrpkm <- preprocess_sample_colors(b, config = config, expr = TRUE)
#'
#' # Subgroups can be avoided even if the config file has them
#' reorderedpsi <- preprocess_sample_colors(a, config = config, subg = FALSE)
#'
#' # Mapping colours to events (e.g. for plotting with plot_multievent)
#' reorderedpsi <- preprocess_sample_colors(a[1:2,], config = config, multi_col = c("red","blue"))
preprocess_sample_colors <- function(data,
config,
subg = TRUE,
expr = FALSE,
col = NULL,
multi_col = NULL) {
if (colnames(data)[1] != "ID") {
stop("Input data may be invalid. Check that format_table() was executed first.")
}
R <- list()
N <- ifelse(expr, ncol(data) - 1, (ncol(data) - 1) / 2)
if (!is.null(col) && length(col) != N) {
stop("The length of col does not match the number of samples")
}
if(!is.null(multi_col) && length(multi_col) != nrow(data)){
stop("The length of multi_col does not match the number of events")
}
#Filter out non-unique events
if(length(unique(data$ID)) < length(data$ID)){
warning("More than one row with the same ID detected. Plotting only one row per ID")
multi_col <- multi_col[!duplicated(data$ID)]
data <- data %>% filter(!duplicated(data$ID))
}
#Defining event colours
#If in multi_event mode and no event colours provided, default ggplot2 will be used
if(!is.null(multi_col)){
multi_col <- tibble(ID=data$ID,
EventRColorCode=multi_col)
} else{
multi_col <- NULL
}
#Try to use config from argument
if(!is.null(config)){
if (is.character(config)) {
config <- read_tsv(config)
}
original_config <- config
# check input file [changed to allow flexible config column order, and subgroups]
if (!all(c("SampleName", "GroupName") %in% colnames(config))) {
stop("Incorrect formatting of headers in config")
}
if (!"Order" %in% colnames(config)) {
config$Order <- seq(1:nrow(config))
}
#To use the subgroups in the config, the column must be there and subg==T
#Otherwise, subgroups equal the individual samples.
subg <- all("SubgroupName" %in% colnames(config),subg==TRUE)
} else { #Make our own config and set subg to F
if (is.null(col)) { #If no colours, use default
col <- rep("black", N)
}
if (expr) {
data.new <- data[,-1]
qual.new <- NULL
} else {
data.new <- data[, seq(2, ncol(data), 2)]
qual.new <- data[, seq(3, ncol(data), 2)]
colnames(qual.new) <- colnames(data.new)
}
config <- data.frame("Order"=seq(1,N),
"SampleName"=names(data.new),
"SubgroupName"=names(data.new),
"GroupName" = names(data.new),
"RColorCode"=col,
stringsAsFactors = F)
subg <- FALSE
original_config <- NULL
}
#Override colors in config if col argument is there
if (!is.null(col)) {
config$RColorCode <- col
} else if (!"RColorCode" %in% colnames(config) || all(is.na(config$RColorCode))) {
Ngroups <- length(unique(config$GroupName))
col <- gg_color_hue(Ngroups)
names(col) <- unique(config$GroupName)
config$RColorCode <- col[config$GroupName]
}
# keep only tissue groups that are present in input data
# (to take into account samples that might have been excluded)
config <- config[config$SampleName %in% colnames(data)[-1],]
if (nrow(config) == 0) {
stop("No matching samples found in config. Are you using the correct config?")
}
# Re-order the PSI table
config <- config %>%
arrange(Order) %>%
mutate(Order=seq(1,nrow(config)))
new.column.idx <- sapply(config$SampleName,
function(x) which(colnames(data) == x))
data.new <- data[,new.column.idx]
#If subgroups are not enabled or they are not present in config,
#then make subgroups equal to samples.
if(subg==FALSE){
config <- config %>%
mutate(SubgroupName=SampleName)
}
sample_order <- config %>%
dplyr::select(Order,SampleName) %>%
dplyr::rename(SampleOrder="Order") %>%
as.data.frame()
subgroup_common <- config %>%
dplyr::select(Order,SampleName,SubgroupName) %>%
group_by(SubgroupName) %>%
dplyr::arrange(Order, .by_group=T)
subgroup <- subgroup_common %>%
ungroup() %>%
dplyr::select(SampleName,SubgroupName) %>%
as.data.frame()
subgroup_order <- subgroup_common %>%
dplyr::select(Order,SubgroupName) %>%
dplyr::summarise(SubgroupOrder=min(Order)) %>%
arrange(SubgroupOrder) %>%
mutate(SubgroupOrder=seq(1:nrow(.))) %>%
dplyr::select(SubgroupOrder,SubgroupName) %>%
as.data.frame()
group_common <- config %>%
dplyr::select(Order,SubgroupName,GroupName,RColorCode) %>%
group_by(SubgroupName) %>%
dplyr::arrange(Order,.by_group=T)
group <- group_common %>%
dplyr::select(Order,SubgroupName,GroupName) %>%
dplyr::summarise(GroupName=first(GroupName)) %>%
as.data.frame()
group_order <- group_common %>%
ungroup() %>%
group_by(GroupName) %>%
dplyr::select(Order,GroupName,RColorCode) %>%
dplyr::summarise(GroupOrder=min(Order),
RColorCode=first(RColorCode)) %>%
dplyr::select(GroupOrder,GroupName,RColorCode) %>%
dplyr::arrange(GroupOrder) %>%
mutate(GroupOrder=seq(1:nrow(.))) %>%
as.data.frame()
config <- sample_order %>%
left_join(subgroup,by="SampleName") %>%
left_join(subgroup_order,by="SubgroupName") %>%
left_join(group,by="SubgroupName") %>%
left_join(group_order,by="GroupName") %>%
dplyr::select(Order=SampleOrder,
SampleName,
SubgroupName,
GroupName,
RColorCode) %>%
as.data.frame()
if (expr) {
qual.new <- NULL
} else {
qual.new <- data[,new.column.idx + 1]
names(qual.new) <- names(data.new)
}
R <- list(data=data.new,
qual=qual.new,
sample_order=sample_order,
subgroup=subgroup,
subgroup_order=subgroup_order,
group=group,
group_order=group_order,
multi_col=multi_col,
config=config,
original_config=original_config)
return(R)
}
#' Simulate default ggplot2 color palette
#'
#' @details
#' \url{https://stackoverflow.com/a/8197703}
#'
#' @param n Number of colors
#' @return vector of colors of length n
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.