#' Function to accept count data matrix and produce Z-scores, optionally averaged by replciates groups,
#' eventually returned as SummarizedExperiment object.
#'
#' The function accepts a standard numeric count matrix or data frame. Will then optionally filter these counts
#' either for overlap with a provided GRanges object (Type = "ranged") or for overlap with a list of gene or region names
#' (Type = "unranged"). Function can optionally filter for certain samples or average samples by groups depending on the GroupIndicator
#' which could e.g. be "_rep", so WT_rep1, WT_rep2, KO_rep1, KO_rep2 => WT and KO were the groups here.
#' Eventually will return the averaged counts (or optionally Z-scores) as individual assays in a (R)SE object.
#'
#' @param InputData a matrix or dataframe with the count data to be plotted
#' @param Type "ranged" for GRanges input or "unranged" when a "Gene" column is present to be used as rowData
#' @param Reference.Regions GRanges object to use for overlap filtering
#' @param Reference.Genes Gene (or any name) list to filter for overlaps
#' @param GroupIndicator the suffix that indicates replicates, e.g. "_rep"
#' @param GroupFilter a list of group names to be retained
#' @param SampleFilter a list of samples to be retained
#' @param transform.log2 logical, whether to transform counts to log2+1
#' @param AverageByGroup logical, whether to average samples by group
#' @param Zscore logical, whether to calculate Z score and return as assays(SE)$Z
#'
#' @details (tba...)
#'
#' @author Alexander Toenges
#'
#' @examples
#' ## Unranged:
#' cts <- sapply(seq(1,10), function(x) rnorm(1000,100))
#' colnames(cts) <- paste0("sample", "_rep", seq(1,10))
#' cts <- data.frame(Gene=paste0("gene", seq(1,1000)), cts)
#' SE <- SEconstructFromCPM(InputData = cts, Type = "unranged", AverageByGroup = TRUE, GroupIndicator = "_rep")
#' ## Ranged:
#' cts1 <- sapply(seq(1,10), function(x) rnorm(1000,100))
#' Gr <- GRanges(seqnames = rep("chr1", 1000), ranges = IRanges(start = seq(1,250), end = rep(1001, 1000)))
#' elementMetadata(Gr) <- cts1
#' RSE <- SEconstructFromCPM(InputData = Gr, Type = "ranged", AverageByGroup = TRUE, GroupIndicator = "_rep")
#' @export
SEconstructFromCPM <- function(InputData, ## CPM data.frame with ranges or genes
Type, ## c("ranged", "unranged")
Reference.Regions = NULL, ## GRanges with regions to filter for
Reference.Genes = NULL, ## list of genes to filter for
GroupIndicator = NULL, ## if one wants group information, split at this seplimiter
GroupFilter = NULL, ## only keep those groups (GroupIndicator as indicator)
SampleFilter = NULL, ## only keep these samples
transform.log2 = FALSE, ## guess what
AverageByGroup = FALSE, ## guess what
Zscore = TRUE ## whether to append Z-scored values as assay
)
{
###########################################################################################################
require(SummarizedExperiment, quietly = TRUE)
###########################################################################################################
if(AverageByGroup & is.null(GroupIndicator)) stop("AverageByGroup requires GroupIndicator")
if(Type != "ranged" & Type != "unranged") stop("Type must be <ranged> or <unranged>", call. = FALSE)
###########################################################################################################
## RangedSummarizedExperiment:
if(Type == "ranged"){
eMetadata <- data.frame(elementMetadata(InputData))
## Check if metadata are present:
if(ncol(eMetadata) == 0) stop("Ranged InputData do not contain elementMetadata!")
## check if any non-numerics
if(sum(!sapply(eMetadata, is.numeric)) > 0){
stop("InputData contains non-numeric data in the count columns")
}
## check if Reference.Regions are GRanges:
if(!is.null(Reference.Regions)){
if(class(Reference.Regions) != "GRanges") stop("Reference.Regions must be GRanges object")
}
## construct RSE:
colnms <- colnames(eMetadata)
## colData definition using simply the sample names
colData <- data.frame(Samples = colnms)
## optionally add replicate group indicators
if(!is.null(GroupIndicator)){
Groups <- sapply(strsplit(colnms, split= GroupIndicator), function(x) x[1])
colData <- cbind(colData, data.frame(Groups=Groups))
}
SummExp <- SummarizedExperiment(assays=SimpleList(counts = elementMetadata(InputData)),
rowRanges = InputData[,0],
colData = colData)
if(class(SummExp) != "RangedSummarizedExperiment") stop("Could not construct RSE object from ranged InputData!")
## optionally subset with Reference.Regions
if(!is.null(Reference.Regions)) SummExp <- subsetByOverlaps(SummExp, Reference.Regions)
}
## SummarizedExperiment (unranged, with gene names as rowData):
if(Type == "unranged"){
colnms <- colnames(InputData)
if(length(which(colnms == "Gene")) == 0) stop("Mandatory <Gene> column is missing in InputData!")
geneCol <- which(colnms == "Gene")
countCols <- which(colnms != "Gene")
## colData definition using simply the sample names
colData <- data.frame(Samples = colnms[countCols])
## optionally add replicate group indicators
if(!is.null(GroupIndicator)){
Groups <- sapply(strsplit(colnms[countCols], split= GroupIndicator), function(x) x[1])
colData <- cbind(colData, data.frame(Groups=Groups))
}
SummExp <- SummarizedExperiment(assays = SimpleList(counts = as.matrix(InputData[,countCols])),
colData = colData,
rowData = data.frame(Gene=InputData[,geneCol]))
if(class(SummExp) != "SummarizedExperiment") stop("Could not construct SE object from gene-level InputData")
## optionally filter for Reference.Genes
if(!is.null(Reference.Genes)){
tmp.isect <- intersect(x = rowData(SummExp)$Gene, y = Reference.Genes)
if( length(tmp.isect) == 0){
stop("No overlaps between gene names of InputData and Reference.Genes")
}
message(length(tmp.isect), " of ", length(Reference.Genes),
" genes from Reference.Genes found in InputData")
SummExp <- SummExp[unlist(rowData(SummExp) %in% Reference.Genes),]
}
}
###########################################################################################################
## Optionally log2-transform:
if(transform.log2) assays(SummExp)$counts <- log2(as.matrix(assays(SummExp)$counts)+1)
###########################################################################################################
## Optionally filter for certain samples:
if(!is.null(GroupFilter)){
tmp.which <- unlist(lapply(GroupFilter, function(x)which(SummExp$Groups==x)))
## print a warning when if not all desired group names could be found back in SummExp:
if(length(tmp.which) < length(unique(SummExp$Groups[tmp.which]))){
message("[Warning] Only found ", length(unique(SummExp$Groups[tmp.which])),
" of ", length(GroupFilter),
" group names in the data. Check correct spelling!")
}
SummExp <- SummExp[,tmp.which]
}
## Same on sample name level:
if(!is.null(SampleFilter)){
tmp.which <- unlist(lapply(SampleFilter, function(x)which(SummExp$Samples==x)))
## print a warning when if not all desired group names could be found back in SummExp:
if(length(tmp.which) < length(unique(SummExp$Samples[tmp.which]))){
message("[Warning] Only found ", length(unique(SummExp$Samples[tmp.which])),
" of ", length(SampleFilter),
" group names in the data. Check correct spelling!")
}
SummExp <- SummExp[,tmp.which]
}
## optionally average by group:
if(AverageByGroup){
averaged <-
do.call(cbind,
lapply(unique(SummExp$Groups), function(x){
tmp.grep <- grep(paste0("^",x,"$"), SummExp$Groups)
if(length(tmp.grep) > 1 ) return(rowMeans2(as.matrix(assays(SummExp)$counts)[,tmp.grep]))
if(length(tmp.grep) == 1 ) return(as.matrix(assays(SummExp)$counts)[,tmp.grep])
}))
colnames(averaged) <- unique(as.character(SummExp$Groups))
SummExp1 <- SummarizedExperiment(assays = SimpleList(counts = averaged),
#rowRanges = rowRanges(SummExp),
#rowData = rowData(SummExp),
colData = data.frame(Samples = colnames(averaged)))
if(Type == "ranged") rowRanges(SummExp1) <- rowRanges(SummExp)
if(Type == "unranged") rowData(SummExp1) <- rowData(SummExp)
To.Return <- SummExp1
} else To.Return <- SummExp
if(Zscore) assays(To.Return)$Z <- as.matrix(t(scale(t(as.matrix(assay(To.Return))))))
## -- ## -- ## -- ##
return(To.Return)
## -- ## -- ## -- ##
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.