#' @export
#'
#' @importFrom BiocGenerics sort
#' @importFrom GenomeInfoDb sortSeqlevels
#' @importFrom GenomicRanges makeGRangesFromDataFrame reduce
#' @importFrom S4Vectors na.omit
#' @importFrom utils setTxtProgressBar txtProgressBar
#'
#' @title stats for the top windows in each region
#' @description given window resutls and normalized counts, combine significant overlapping windows into regions and
#' for each region, pick two candidate winodws:
#' \enumerate{
#' \item with highest log2FoldChange and
#' \item with highest normalized mean in treatment samples (see parameter \code{treatmentCols})
#' }
#' Return a data.frame with region information and stats, and for the selected windows, the following information:
#' \itemize{
#' \item \code{unique_id} of the window
#' \item start and end co-ordinates
#' \item log2FoldChange
#' \item normalized mean expression in treatment and control samples and
#' \item individual normalized expression in replicates
#' }
#'
#' @details
#' The output data.frame of this function has the following columns:
#' \itemize{
#' \item \code{chromosome}: chromosome name
#' \item \code{gene_id}: gene id
#' \item \code{gene_name}: gene name
#' \item \code{gene_region}: gene region
#' \item \code{gene_type}: gene type annotation
#' \item \code{regionStartId}: \code{unique_id} of the left most window, where a enriched region begins
#' \item \code{region_begin}: start position of the enriched region
#' \item \code{region_end}: end position of the enriched region
#' \item \code{region_length}: length of the enrched region
#' \item \code{strand}: strand info
#' \item \code{Nr_of_region}: number of the current region
#' \item \code{Total_nr_of_region}: total number of regions
#' \item \code{log2FoldChange_min}: min. log 2 fold change in the region
#' \item \code{log2FoldChange_mean}: average log 2 fold change in the region
#' \item \code{log2FoldChange_max}: max. log 2 fold change in the region
#' \item \code{unique_id.log2FCWindow}: unique_id of the window with largest log2FoldChange
#' \item \code{begin.log2FCWindow}: start position of the window with largest log2FoldChange
#' \item \code{end.log2FCWindow}: end of the window with largest log2FoldChange
#' \item \code{log2FoldChange.log2FCWindow}: log2FoldChange of the window with largest log2FoldChange
#' \item \code{treatmentName.mean.log2FCWindow}: mean of the normalized expression of the treatment samples for log2FCWindow, names in \code{treatmentCols} are used to calculate mean and treatmentName is from the parameter \code{treatmentName}
#' \item \code{controlName.mean.log2FCWindow}: mean of the normalized expression of the control samples for log2FCWindow, \code{colnames(normalizedCounts)} not found in \code{treatmentCols} are used to calculate mean and controlName is from the parameter \code{controlName}
#' \item the next columns will be normalized expression values of the log2FCWindow from individual treatment and control samples.
#' \item \code{unique_id.meanWindow}: unique_id of the window with largest mean in all treatment samples from \code{treatmentCols}
#' \item \code{begin.meanWindow}: start position of the mean window
#' \item \code{end.meanWindow}: end position of the mean window
#' \item \code{log2FoldChange.meanWindow}:log2FoldChange of the mean window
#' \item \code{treatmentName.mean.meanWindow}: mean of the normalized expression of the treatment samples for meanWindow, names in \code{treatmentCols} are used to calculate mean and treatmentName is from the parameter \code{treatmentName}
#' \item \code{controlName.mean.meanWindow}: mean of the normalized expression of the control samples for log2FCWindow, \code{colnames(normalizedCounts)} not found in \code{treatmentCols} are used to calculate mean and controlName is from the parameter \code{controlName}
#' \item the next columns will be normalized expression values of the meanWindow from individual treatment and control samples
#' }
#'
#' @param windowRes \code{data.frame}, output from \code{\link{resultsDEWSeq}}
#' @param padjCol \code{character}, name of the adjusted pvalue column (default: padj)
#' @param padjThresh \code{numeric}, threshold for p-adjusted value (default: 0.05)
#' @param log2FoldChangeCol \code{character}, name of the log2foldchange column (default: log2FoldChange)
#' @param log2FoldChangeThresh \code{numeric}, threshold for log2foldchange value (default:1)
#' @param start0based \code{logical}, TRUE (default) or FALSE. If TRUE, then the start positions in \code{windowRes} is considered to be 0-based
#' @param normalizedCounts \code{data.frame} or \code{matrix}, normalized read counts per window. \code{rownames(normalizedCounts)} and \code{unique_id} column from \code{windoeRes} must match
#' see \code{\link[DESeq2:counts]{counts}}, \code{\link[DESeq2:vst]{vst}} or \code{\link[DESeq2:rlog]{rlog}}
#' @param treatmentCols \code{character vector}, column names in \code{normalizedCounts} for treatment/case samples. The remaining columns in the data.frame will be considered control samples
#' @param treatmentName \code{character}, treatment name, see Details (default: treatment)
#' @param controlName \code{character}, control name, see Details (default: control)
#' @param op \code{character}, can be one of \code{max} (default) or \code{min}. \code{max} returns windows with maximum log2FoldChange and mean normalized expression in the \code{treatmentCols} columns,
#' \code{min} returns windows with minimum log2FoldChange and mean normalized expression
#'
#'
#' @examples
#'
#' data(slbpWindows)
#' data(slbpVst)
#' slbpList <- topWindowStats(slbpWindows,padjCol = 'pSlidingWindows.adj',
#' normalizedCounts = slbpVst, treatmentCols = c('IP1','IP2'),
#' treatmentName = 'SLBP',controlName = 'SMI')
#'
#' @return data.frame
topWindowStats <- function(windowRes,padjCol='padj',padjThresh=0.05,log2FoldChangeCol='log2FoldChange',log2FoldChangeThresh=1,start0based=TRUE, normalizedCounts,
treatmentCols,treatmentName='treatment',controlName='control', op='max'){
requiredCols <- c('chromosome','unique_id','begin','end','strand','gene_id','gene_name',
'gene_type','gene_region','Nr_of_region','Total_nr_of_region','window_number',padjCol,log2FoldChangeCol)
missingCols <- setdiff(requiredCols,colnames(windowRes))
if(length(missingCols)>0){
stop('Input data.frame is missing required columns, needed columns:
chromosome: chromosome name
unique_id: unique id of the window
begin: window start co-ordinate
end: window end co-ordinate
strand: strand
gene_id: gene id
gene_name: gene name
gene_type: gene type annotation
gene_region: gene region
Nr_of_region: number of the current region
Total_nr_of_region: total number of regions
window_number: window number
',padjCol,': p-adjusted value column
',log2FoldChangeCol,': log2foldchange column.
Missing columns: ',paste(missingCols,collapse=", "),'')
}
windowRes <- na.omit(windowRes[,requiredCols])
sigDat <- windowRes[ windowRes[,padjCol]<=padjThresh & windowRes[,log2FoldChangeCol]>=log2FoldChangeThresh, ]
rownames(sigDat) <- sigDat$unique_id
if(nrow(sigDat)==0){
stop('There are no significant windows/regions under the current threshold!\nPlease lower your significance cut-off thresholds and manually check if there are any significant windows under the threshold')
# return(NULL)
}
if (length(unique(treatmentCols))<length(treatmentCols)){
stop('There are repeating names in treatmentCols')
}
if(length(setdiff(treatmentCols,colnames(normalizedCounts)))>0){
stop('mismatch treatmentCols and column names of normalizedCounts!')
}
controlCols <- setdiff(colnames(normalizedCounts),treatmentCols)
op <- match.arg(op,c('min','max'),several.ok = FALSE)
callFn <- which.max
if(op=='min'){
callFn <- which.min
}
normalizedCounts <- as.data.frame(na.omit(normalizedCounts))
commonids <- intersect(rownames(normalizedCounts),rownames(sigDat))
if(length(commonids)==0){
stop('There are no common elements between significant windows/regions in under the current threshold and windows in normalizedCounts data.frame!\nPlease lower your significance cut-off thresholds and manually check if there are any significant windows under the threshold')
}
sigDat <- cbind(sigDat[commonids,],normalizedCounts[commonids,])
sigRange <- makeGRangesFromDataFrame(sigDat,seqnames.field = 'gene_id',start.field = 'begin',end.field = 'end',strand.field = 'strand',
ignore.strand=FALSE,starts.in.df.are.0based=start0based,keep.extra.columns = TRUE)
sigRange <- sortSeqlevels(sigRange)
sigRange <- sort(sigRange)
sigReduce <- reduce(sigRange,drop.empty.ranges=TRUE,with.revmap=TRUE,min.gapwidth=1)
if(nrow(mcols(sigReduce))<1){
stop('Cannot find overlapping windows (regions) in input results!')
}
log2FCWindowCols <- paste(colnames(normalizedCounts),'log2FCWindow',sep='.')
log2FCMean <- paste(c(treatmentName,controlName),'mean.log2FCWindow',sep='.')
meanWindowCols <- paste(colnames(normalizedCounts),'meanWindow',sep='.')
meanMean <- paste(c(treatmentName,controlName),'mean.meanWindow',sep='.')
outCols <- c(c('gene_id','region_begin','region_end','region_length','strand','regionStartId','chromosome','gene_name','gene_region','gene_type',
'Nr_of_region','Total_nr_of_region',paste0('min.',log2FoldChangeCol),paste0('mean.',log2FoldChangeCol),paste0('max.',log2FoldChangeCol),
'unique_id.log2FCWindow','begin.log2FCWindow','end.log2FCWindow',paste0(log2FoldChangeCol,'.log2FCWindow')),log2FCMean, log2FCWindowCols,
c('unique_id.meanWindow','begin.meanWindow','end.meanWindow',paste0(log2FoldChangeCol,'.meanWindow')),meanMean,meanWindowCols)
outDat <- cbind(as.data.frame(sigReduce)[,c(1:5)],data.frame(matrix(data=NA,nrow=nrow(mcols(sigReduce)),ncol = length(outCols)-5)))
colnames(outDat) <- outCols
# now fill out 0s for the mean window columns
outDat$unique_id.meanWindow <- 'same window'
outDat[,c('begin.meanWindow','end.meanWindow',meanMean,meanWindowCols)] <- 0
# mean, log2foldchange comparison data.frame
pb <- txtProgressBar(min=0,max=length(sigReduce),style = 3)
for(i in seq_len(length(sigReduce))){
mergeInd <- sigReduce$revmap[[i]]
outDat[i,'regionStartId'] <- mcols(sigRange)[min(mergeInd),'unique_id']
outDat[i,'regionStartId'] <- mcols(sigRange)[min(mergeInd),'unique_id']
outDat[i,'chromosome'] <- mcols(sigRange)[min(mergeInd),'chromosome']
outDat[i,'gene_name'] <- mcols(sigRange)[min(mergeInd),'gene_name']
outDat[i,'gene_type'] <- mcols(sigRange)[min(mergeInd),'gene_type']
outDat[i,'gene_region'] <- mcols(sigRange)[min(mergeInd),'gene_region']
outDat[i,'gene_type'] <- mcols(sigRange)[min(mergeInd),'gene_type']
outDat[i,'Nr_of_region'] <- mcols(sigRange)[min(mergeInd),'Nr_of_region']
outDat[i,'Total_nr_of_region'] <- mcols(sigRange)[min(mergeInd),'Total_nr_of_region']
#get the window with minimum/maximum log2 Foldchange
# fill log2FoldChange first
outDat[i,paste0('min.',log2FoldChangeCol)] <- min(unlist(mcols(sigRange)[mergeInd,log2FoldChangeCol]))
outDat[i,paste0('mean.',log2FoldChangeCol)] <- mean(unlist(mcols(sigRange)[mergeInd,log2FoldChangeCol]))
outDat[i,paste0('max.',log2FoldChangeCol)] <- max(unlist(mcols(sigRange)[mergeInd,log2FoldChangeCol]))
log2FCInd <- callFn(mcols(sigRange)[mergeInd,log2FoldChangeCol])
outDat[i,'unique_id.log2FCWindow'] <- mcols(sigRange)[mergeInd[log2FCInd],'unique_id']
outDat[i,'begin.log2FCWindow'] <- start(sigRange[mergeInd[log2FCInd]])
outDat[i,'end.log2FCWindow'] <- end(sigRange[mergeInd[log2FCInd]])
outDat[i,paste0(log2FoldChangeCol,'.log2FCWindow')] <- mcols(sigRange)[mergeInd[log2FCInd],log2FoldChangeCol]
outDat[i,paste0(treatmentName,'.mean.log2FCWindow')] <- mean(unlist(mcols(sigRange)[mergeInd[log2FCInd],treatmentCols]))
outDat[i,paste0(controlName,'.mean.log2FCWindow')] <- mean(unlist(mcols(sigRange)[mergeInd[log2FCInd],controlCols]))
outDat[i,log2FCWindowCols] <- unlist(mcols(sigRange)[mergeInd[log2FCInd],colnames(normalizedCounts)])
# fill out log2FC window data
# get the window with minimum/maximum normalized count
meanInd <- callFn( rowMeans( as.data.frame(mcols(sigRange)[mergeInd,treatmentCols]) ) )
if(log2FCInd!=meanInd){
outDat[i,'unique_id.meanWindow'] <- mcols(sigRange)[mergeInd[meanInd],'unique_id']
}
outDat[i,'begin.meanWindow'] <- start(sigRange[mergeInd[meanInd]])
outDat[i,'end.meanWindow'] <- end(sigRange[mergeInd[meanInd]])
outDat[i,paste0(log2FoldChangeCol,'.meanWindow')] <- mcols(sigRange)[mergeInd[meanInd],log2FoldChangeCol]
outDat[i,paste0(treatmentName,'.mean.meanWindow')] <- mean(unlist(mcols(sigRange)[mergeInd[meanInd],treatmentCols]))
outDat[i,paste0(controlName,'.mean.meanWindow')] <- mean(unlist(mcols(sigRange)[mergeInd[meanInd],controlCols]))
outDat[i,meanWindowCols] <- unlist(mcols(sigRange)[mergeInd[meanInd],colnames(normalizedCounts)])
# fill out mean window data
# progress
setTxtProgressBar(pb=pb,value=i)
}
close(pb)
if(start0based){ # return 0 based start positions
outDat$region_begin <- outDat$region_begin-1
outDat$begin.log2FCWindow <- outDat$begin.log2FCWindow-1
outDat$begin.meanWindow <- pmax(outDat$begin.meanWindow-1,0)
}
colOrder <- c(c('chromosome','gene_id','gene_name','gene_region','gene_type','regionStartId','region_begin','region_end','region_length','strand','Nr_of_region','Total_nr_of_region',
paste0('min.',log2FoldChangeCol),paste0('mean.',log2FoldChangeCol),paste0('max.',log2FoldChangeCol),'unique_id.log2FCWindow','begin.log2FCWindow','end.log2FCWindow',
paste0(log2FoldChangeCol,'.log2FCWindow')),log2FCMean,log2FCWindowCols,c('unique_id.meanWindow','begin.meanWindow','end.meanWindow',paste0(log2FoldChangeCol,'.meanWindow')),
meanMean,meanWindowCols)
# sort main results
outDat <- outDat[order(outDat$chromosome,outDat$region_begin),] # sort as a separate step to fix
rownames(outDat) <- NULL # fix jumbled up rownames
return(outDat[,colOrder])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.