misterplotR: Functions for plotting and clustering of count data

Documented in SEconstructFromCPM

#' Function to accept count data matrix and produce Z-scores, optionally averaged by replciates groups,
#' eventually returned as SummarizedExperiment object.
#' 
#' The function accepts a standard numeric count matrix or data frame. Will then optionally filter these counts
#' either for overlap with a provided GRanges object (Type = "ranged") or for overlap with a list of gene or region names 
#' (Type = "unranged"). Function can optionally filter for certain samples or average samples by groups depending on the GroupIndicator
#' which could e.g. be "_rep", so WT_rep1, WT_rep2, KO_rep1, KO_rep2 => WT and KO were the groups here.
#' Eventually will return the averaged counts (or optionally Z-scores) as individual assays in a (R)SE object.
#' 
#' @param InputData a matrix or dataframe with the count data to be plotted
#' @param Type "ranged" for GRanges input or "unranged" when a "Gene" column is present to be used as rowData
#' @param Reference.Regions GRanges object to use for overlap filtering
#' @param Reference.Genes Gene (or any name) list to filter for overlaps
#' @param GroupIndicator the suffix that indicates replicates, e.g. "_rep"
#' @param GroupFilter a list of group names to be retained
#' @param SampleFilter a list of samples to be retained
#' @param transform.log2 logical, whether to transform counts to log2+1
#' @param AverageByGroup logical, whether to average samples by group
#' @param Zscore logical, whether to calculate Z score and return as assays(SE)$Z
#'
#' @details (tba...)
#' 
#' @author Alexander Toenges
#' 
#' @examples 
#' ## Unranged:
#' cts <- sapply(seq(1,10), function(x) rnorm(1000,100))
#' colnames(cts) <- paste0("sample", "_rep", seq(1,10))
#' cts <- data.frame(Gene=paste0("gene", seq(1,1000)), cts)
#' SE <- SEconstructFromCPM(InputData = cts, Type = "unranged", AverageByGroup = TRUE, GroupIndicator = "_rep")
#' ## Ranged:
#' cts1 <- sapply(seq(1,10), function(x) rnorm(1000,100))
#' Gr <- GRanges(seqnames = rep("chr1", 1000), ranges = IRanges(start = seq(1,250), end = rep(1001, 1000)))
#' elementMetadata(Gr) <- cts1
#' RSE <- SEconstructFromCPM(InputData = Gr, Type = "ranged", AverageByGroup = TRUE, GroupIndicator = "_rep")
#' @export
SEconstructFromCPM <- function(InputData,                ## CPM data.frame with ranges or genes
                               Type,                     ## c("ranged", "unranged")
                               
                               Reference.Regions = NULL, ## GRanges with regions to filter for
                               Reference.Genes = NULL,   ## list of genes to filter for
                               GroupIndicator = NULL,    ## if one wants group information, split at this seplimiter
                               GroupFilter = NULL,       ## only keep those groups (GroupIndicator as indicator) 
                               SampleFilter = NULL,      ## only keep these samples
                               transform.log2 = FALSE,   ## guess what
                               AverageByGroup = FALSE,   ## guess what 
                               Zscore = TRUE             ## whether to append Z-scored values as assay
                               )
                               
  
{
  ###########################################################################################################
  
  require(SummarizedExperiment, quietly = TRUE)
  
  ###########################################################################################################
  
  if(AverageByGroup & is.null(GroupIndicator)) stop("AverageByGroup requires GroupIndicator")
  
  if(Type != "ranged" & Type != "unranged") stop("Type must be <ranged> or <unranged>", call. = FALSE)
  
  ###########################################################################################################
  
  ## RangedSummarizedExperiment:
  
  if(Type == "ranged"){
    
    eMetadata <- data.frame(elementMetadata(InputData))
    
    ## Check if metadata are present:
    if(ncol(eMetadata) == 0) stop("Ranged InputData do not contain elementMetadata!")
    
    ## check if any non-numerics
    if(sum(!sapply(eMetadata, is.numeric)) > 0){
      stop("InputData contains non-numeric data in the count columns")
    }
    
    ## check if Reference.Regions are GRanges:
    if(!is.null(Reference.Regions)){
      if(class(Reference.Regions) != "GRanges") stop("Reference.Regions must be GRanges object")
    }
    
    ## construct RSE:
    colnms <- colnames(eMetadata)
    
    ## colData definition using simply the sample names
    colData <- data.frame(Samples = colnms)
    
    ## optionally add replicate group indicators
    if(!is.null(GroupIndicator)){
      Groups <- sapply(strsplit(colnms, split= GroupIndicator), function(x) x[1])
      colData <- cbind(colData, data.frame(Groups=Groups))
    } 
      
    SummExp <- SummarizedExperiment(assays=SimpleList(counts = elementMetadata(InputData)),
                                    rowRanges = InputData[,0],
                                    colData = colData)
    
    if(class(SummExp) != "RangedSummarizedExperiment") stop("Could not construct RSE object from ranged InputData!")
    
    ## optionally subset with Reference.Regions
    if(!is.null(Reference.Regions)) SummExp <- subsetByOverlaps(SummExp, Reference.Regions)

    } 
    
    ## SummarizedExperiment (unranged, with gene names as rowData):
  
    if(Type == "unranged"){
      
      colnms <- colnames(InputData)
      
      if(length(which(colnms == "Gene")) == 0) stop("Mandatory <Gene> column is missing in InputData!")
      
      geneCol <- which(colnms == "Gene")
      countCols <- which(colnms != "Gene")
      
      ## colData definition using simply the sample names
      colData <- data.frame(Samples = colnms[countCols])
      
      ## optionally add replicate group indicators
      if(!is.null(GroupIndicator)){
        Groups <- sapply(strsplit(colnms[countCols], split= GroupIndicator), function(x) x[1])
        colData <- cbind(colData, data.frame(Groups=Groups))
      } 
      
      SummExp <- SummarizedExperiment(assays  = SimpleList(counts = as.matrix(InputData[,countCols])),
                                   colData = colData,
                                   rowData = data.frame(Gene=InputData[,geneCol]))
      
      if(class(SummExp) != "SummarizedExperiment") stop("Could not construct SE object from gene-level InputData")
      
      ## optionally filter for Reference.Genes
      if(!is.null(Reference.Genes)){
        
          tmp.isect <- intersect(x = rowData(SummExp)$Gene, y = Reference.Genes)
          
          if( length(tmp.isect) == 0){
            stop("No overlaps between gene names of InputData and Reference.Genes")
          }
          
          message(length(tmp.isect), " of ", length(Reference.Genes), 
                  " genes from Reference.Genes found in InputData")
          
          SummExp <- SummExp[unlist(rowData(SummExp) %in% Reference.Genes),]
          
        } 
    }
  
  ###########################################################################################################
  
  ## Optionally log2-transform:
  if(transform.log2) assays(SummExp)$counts <- log2(as.matrix(assays(SummExp)$counts)+1)
  
  ###########################################################################################################
  ## Optionally filter for certain samples:
  if(!is.null(GroupFilter)){
   
     tmp.which <- unlist(lapply(GroupFilter, function(x)which(SummExp$Groups==x)))
    
     ## print a warning when if not all desired group names could be found back in SummExp:
     if(length(tmp.which) < length(unique(SummExp$Groups[tmp.which]))){
       
      message("[Warning] Only found ", length(unique(SummExp$Groups[tmp.which])), 
              " of ", length(GroupFilter),
              " group names in the data. Check correct spelling!")
       
    }
     SummExp <- SummExp[,tmp.which]
  }
    
    ## Same on sample name level:
  if(!is.null(SampleFilter)){
    
    tmp.which <- unlist(lapply(SampleFilter, function(x)which(SummExp$Samples==x)))
    
    ## print a warning when if not all desired group names could be found back in SummExp:
    if(length(tmp.which) < length(unique(SummExp$Samples[tmp.which]))){
      
      message("[Warning] Only found ", length(unique(SummExp$Samples[tmp.which])), 
              " of ", length(SampleFilter),
              " group names in the data. Check correct spelling!")
      
    }
    SummExp <- SummExp[,tmp.which]
  }
  
  ## optionally average by group:
  if(AverageByGroup){
    
    averaged <- 
    do.call(cbind,
            lapply(unique(SummExp$Groups), function(x){
              tmp.grep <- grep(paste0("^",x,"$"), SummExp$Groups)
              if(length(tmp.grep) > 1 )  return(rowMeans2(as.matrix(assays(SummExp)$counts)[,tmp.grep]))
              if(length(tmp.grep) == 1 ) return(as.matrix(assays(SummExp)$counts)[,tmp.grep])
            }))
    colnames(averaged) <- unique(as.character(SummExp$Groups))
    
    SummExp1 <- SummarizedExperiment(assays    = SimpleList(counts = averaged),
                                     #rowRanges = rowRanges(SummExp),
                                     #rowData   = rowData(SummExp),
                                     colData   = data.frame(Samples = colnames(averaged)))
    
    if(Type == "ranged")    rowRanges(SummExp1) <- rowRanges(SummExp)
    if(Type == "unranged")  rowData(SummExp1)   <- rowData(SummExp)
    
    To.Return <- SummExp1
  } else To.Return <- SummExp
  
  if(Zscore) assays(To.Return)$Z <- as.matrix(t(scale(t(as.matrix(assay(To.Return))))))
  
  ## -- ## -- ## -- ## 
  return(To.Return)
  ## -- ## -- ## -- ## 
}