R/DataIntegRFunctions.R

#' Function that rearranges list of scores calculated per cell type
#' 
#' @param scores.per.cell.type list of result score matrices for each biwgig
#'  file
#' @param sampleIDs  A vector of unique sample ids/names of bigwig files 
#' @return a dataframe of quatified scores per genome window and bigwig files.
#' Each row corresponds to ranges in the genome, and columns correspond to the 
#' bigwig files. 
#' @keywords internal
#' @author Inga Patarcic
scoresAsMcols <- function(scores.per.cell.type,sampleIDs){
  
  
          mcols.per.cell.type <- plyr::rbind.fill(lapply(scores.per.cell.type,
                      function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))
     
        # adjust column names - remove bw extension
        rownames(mcols.per.cell.type) <- sampleIDs
       
     return(t(mcols.per.cell.type))
}



#' Function that runs normalization for the quantified activity/expression level
#' 
#' @param scoresDF a dataframe of quatified scores per genome window and bigwig 
#'  file
#' @param normalizationMet  charater. which normalization procedure is wanted.
#' @return a normalized dataframe of quatified scores per genome window and 
#' bigwig files. 
#' @keywords internal
#' @author Inga Patarcic
normalizeScores <- function(scoresDF,normalizationMet){

    if (normalizationMet=="ratio") {   
            # estimate size factors and multiply gene expression using these 
            # factors
    sizeFactors <- DESeq2::estimateSizeFactorsForMatrix(as.matrix(scoresDF))
    scoresDF.normalized <- as.matrix(scoresDF)*rep(sizeFactors,each=
                                                            nrow(scoresDF))}
                
    if (normalizationMet=="quantile") {  
 scoresDF.normalized <- preprocessCore::normalize.quantiles(as.matrix(scoresDF))
          rownames(scoresDF.normalized) <- rownames(scoresDF)
          colnames(scoresDF.normalized) <- colnames(scoresDF)}
           
 
  return(scoresDF.normalized)

  
  }
  



#' Function that quantifies exon expression based on strand specificity
#' 
#'  Function that separates stranded and unstranded libraries and quantifies 
#'  exon expression separately for stranded libraries eg expression of exons 
#'  located on the + strand is quantified using forward libraries if libraries
#'  are unstranded then exon orientations is not taken into account
#'  
#' @param exons GRanges object with exon coordinates and metadata about genes
#' @param exons.splitted GRanges object with exon coordinates splitted based on
#' the strand.
#' @param target character, paths to bigwig files
#' @param libStrand character, library strandness ("+","-","*")
#' @param sampleIDs character, name of samples/bigwig files
#' @param summaryOperation argument for ScoreMatrixBin() how to quantify scores
#' @param mc.cores integer, number of cores for mclapply
#' @return Granges object with metadata which corresponds to quantified scores 
#' per input exon regions and bigwig files
#' @keywords internal
#' @author Inga Patarcic
exonExpressionStrandAdjusted <- function(exonsForQuant,
                                          exonsSplitted,
                                          target,
                                          sampleIDs,
                                          libStrand,
                                          mc.cores,
                                          summaryOperation) {
  
  #exonsForQuant <- exons
  #exonsSplitted <- exons.splitted

  
  if (length(exonsSplitted$`*`)!=0){stop("Strand of exons can be only + or -")}
  
        # detect stranded tracks and unstranded tracks    
        ForwardLibraries <- stringr::str_detect(libStrand,"\\+") 
        Unstranded.libraries <- stringr::str_detect(libStrand,"\\*")
        
  if (sum(ForwardLibraries)!=(sum(!(ForwardLibraries|Unstranded.libraries)))){
    stop("Number of forward and reverse libraries do not match")}
  
  
  # separate analysis for unstranded & stranded libraries
  
  if (sum(Unstranded.libraries)!=length(libStrand)){
    # test strandness - "mixed"&"stranded"
    # Stranded libraries - positive/forward strand
    
    # forward strand
    exonscoresForwardStrand <- mclapply(target[ForwardLibraries], 
                                        function(x) {try(
                                        genomation::ScoreMatrixBin(x,
                                              windows =  exonsSplitted$`+`,
                                              bin.num = 1,
                                              type = "bigWig",
                                              bin.op=summaryOperation, 
                                              is.noCovNA=F),
                                              silent = T)},
                                        mc.cores=mc.cores)
    
    if (max(sapply(exonscoresForwardStrand,class)=="try-error")){
      stop(paste("Error with ScoreMatrixBin and following bigwig files",
                 target[ForwardLibraries][sapply(
                   exonscoresForwardStrand,class)=="try-error"]))}                                      
    
    exonscoresForwardStrand.df <- scoresAsMcols(scores.per.cell.type=
                                                  exonscoresForwardStrand,
                                        sampleIDs=sampleIDs[ForwardLibraries])
    
    # reverse strand               
    exonscoresReverseStrand <- mclapply(target[!(ForwardLibraries|
                                                        Unstranded.libraries)], 
                                        function(x) {try(
                                          genomation::ScoreMatrixBin(x,
                                               windows =  exonsSplitted$`-`,
                                               bin.num = 1,
                                               type = "bigWig", 
                                               bin.op=summaryOperation,
                                               is.noCovNA=F),
                                               silent = T)},
                                        mc.cores=mc.cores)
    
    if (max(sapply(exonscoresReverseStrand,class)=="try-error")){
      stop(paste("This bigwig files do not overlap exon regions",
                 target[!(ForwardLibraries|Unstranded.libraries)]
                 [sapply(exonscoresReverseStrand,class)=="try-error"]))}
    
    exonscoresReverseStrand.df <- abs(scoresAsMcols(scores.per.cell.type=
                                                      exonscoresReverseStrand,
                                        sampleIDs=sampleIDs[ForwardLibraries]))
    
    
    # test if exon length is equal to the length of splitted exons
    if (length(exonsForQuant)!=sum(c(nrow(exonscoresForwardStrand.df),
                             nrow(exonscoresReverseStrand.df)))) {stop("Exons
                                    and splitted exon object are not equal")}
    
    
    # pool reverse and forward strand together    
    
    if (any(c(exonsSplitted$`-`$name,exonsSplitted$`+`$name)!=
            exonsForQuant$name)){break("Exons strand after quantification not correct")}
    
    
    DF_exons <- rbind(exonscoresReverseStrand.df,
                      exonscoresForwardStrand.df)
        row.names(DF_exons) <- NULL # adjusting for problem of equal row.names
        
        values(exonsForQuant) <- cbind(rbind(mcols(exonsSplitted$`-`),
                                             mcols(exonsSplitted$`+`)),
                                   DataFrame(DF_exons))
    
        
        
    }
  
  if (sum(Unstranded.libraries)!=0){
    
    
    # Unstranded
    exonscoresUnstranded <- mclapply(target[Unstranded.libraries], 
                                     function(x){try(
                                       genomation::ScoreMatrixBin(x,
                                                      windows = exonsForQuant,
                                                      bin.num = 1,
                                                      type = "bigWig",
                                                      bin.op=summaryOperation,
                                                      is.noCovNA=F),
                                                    silent = T)},
                                     mc.cores=mc.cores)
    
    if (max(sapply(exonscoresUnstranded,class)=="try-error")){
      stop(paste("This bigwig files do not overlap exon regions",
                 target[Unstranded.libraries]
                 [sapply(exonscoresUnstranded,class)=="try-error"]))}                                      
    
    
    exonscoresUnstranded.df <- scoresAsMcols(scores.per.cell.type=
                                               exonscoresUnstranded,
                                    sampleIDs=sampleIDs[Unstranded.libraries])                  
    
    values(exonsForQuant) <- cbind(mcols(exonsForQuant),
                                  DataFrame(exonscoresUnstranded.df))
    
    
  }
  
  return(exonsForQuant)                
}



#' Function that quantifies gene expression based on prequantified exon exp
#' 
#' Function that calculates per gene expression based on exon expression results
#' sum across exons (mean value across exon*exon width)/
#' gene_length(sum_of_all_exon_lengths)
#'  
#' @param expressionPerGene GRanges object with exon coordinates and 
#' metadata info about corresponding genes and quantifed exon expression
#' @return Granges object of genes with metadata which corresponds to quantified
#' gene expression
#' @keywords internal
#' @author Inga Patarcic
quantifyGeneExpression <- function(expressionPerGene,exonsForGene){
  
  ExonLength <- width(expressionPerGene)
  GeneLength <- sum(ExonLength,na.rm=T)
  ExpressionPerSample=as.data.frame(mcols(expressionPerGene)[-(1:exonsForGene)])
        
    GeneExpression <- apply(ExpressionPerSample*ExonLength,
                            2,sum,na.rm=T)/GeneLength
  
      return(GeneExpression)
}




#' Calculates regulatory activity over pre-defined regions
#' 
#' The function calculates regulatory activity from histone
#' modification, DNAse or methylation signals for pre-defined regulatory
#' regions and returns a GRanges object with regulatory region locations
#' and their activity over a set of samples.
#' 
#' 
#' @param windows a GRanges object that contains regulatory regions
#' over which the regulatory activity will be calculated. 
#' It is strongly suggested to adjust 
#' seqlengths of this object to be equal to the seqlenghts Hsapiens from the 
#' appropriate package (BSgenome.Hsapiens.UCSC.hg19 or whatever needed version).
#' @param target a named list of BigWig files. Names correspond to 
#'        unique sample ids/names.
#' @param sampleIDs NULL (default). A vector of unique sample
#' ids/names(.bw files), ordered as the bigwig files are ordered. When NULL
#' basenames of .bw files is used as a unique sample ids/names.
#' @param isCovNA (def:FALSE), if this is set to TRUE, uncovered
#' bases are set to NA, this is important when dealing with methylation
#' data, where uncovered bases on a bigWig file do not mean zero methylation.
#' @param weightCol a numeric column in meta data part used as weights. Useful
#' when genomic regions have scores other than their coverage values, 
#' such as percent methylation, conservation scores, GC content, etc.
#' @param normalize NULL(default). Optional "quantile" and "ratio"
#' If set to "quantile" activity measures are quantile normalized as 
#' implemented in \code{\link[preprocessCore]{normalize.quantiles}} and
#' returned ; if set to "ratio" then "median ratio method" implemented
#' as \code{\link[DESeq2]{estimateSizeFactorsForMatrix}} is used to normalize 
#' activity measures.
#' @param summaryOperation "mean"(default).  An argument for 
#' \code{\link[genomation]{ScoreMatrixBin}} that is in the nutshell of 
#' quantifying enhancer activity across pre-defined enhancer regions.
#' This designates which summary operation should be used over the regions.
#' Currently, only mean is available, but "median" or "sum" will be implemented
#' in the future.
#' @param mc.cores (def:1) Define the number of cores to use;
#' at most how many child processes will be run simultaneously 
#' using mclapply from parallel package. Parallelization requires at 
#' least two cores.
#' 
#' @return a GRanges object where its meta-columns correspond
#'         to calculated acvitity measures and column names 
#'         correspond to provided sample ids or names.
#' 
#' 
#' @details regulatory activity is measured by averaging logFC for
#' histone modification ChIP-seq profiles, or DNAse signal, or methylation
#' per base.Currently, relevant bigWig files are required to calculate activity       
#' activity. This function might be extended to work with BAM files
#' in the future. 
#' 
#' @examples # INPUT1: defining .bw files:
#' 
#' test.bw <- system.file("extdata", "test.bw",package = "reg2gene")
#' test2.bw <- system.file("extdata", "test2.bw",package = "reg2gene")
#' 
#' # INPUT2: defining regulatory regions:windows
#' 
#' windows <- GRanges(c(rep("chr1",4),rep("chr2",2)),
#'                       IRanges(c(1,7,9,15,1,15),c(4,8,14,20,4,20)),
#'                                             c(rep("+",3),rep("-",3)))
#' windows$reg <-  windows[c(1,1,3:6)]
#' windows$name2 <- windows$name <- paste0("TEST_Reg",
#'                                         c(1,1,3:length(windows)))
#'                         
#'  # OUTPUT  regActivity():                                                      
#' regActivity(windows,c(test.bw,test2.bw))   
#' 
#' # Additionaly, sample names can changed with sampleIDs argument
#' 
#' regActivity(windows,c(test.bw,test2.bw),sampleIDs=c("Cell1","Cell2"))
#' 
#' # Additionaly, it supports different normalization procedures:
#' 
#' regActivity(windows,c(test.bw,test2.bw),normalize ="ratio")
#' regActivity(windows,c(test.bw,test2.bw),normalize ="quantile")
#' 
#' @export
regActivity <- function(windows,
                        target,
                        sampleIDs=NULL,
                        isCovNA=FALSE,
                        weightCol=NULL,
                        summaryOperation="mean",
                        normalize=NULL,
                        mc.cores=1){
  
  
       # test input - ranges
  if (!exists("windows")) {stop("windows object missing")}
      # test input - bigwig files
  if (!min(sapply(target,file.exists))) 
          {stop("target object missing")}
  if (is.null(sampleIDs)) {
      bw.exts = c(".bw",".bigWig",".bigwig",".BigWig", ".BIGWIG", ".BW")
      sampleIDs <- stringr::str_replace(basename(target),
                               paste(bw.exts,collapse="|"),"")}
          
          
  # calculating coverage
         
          scores.per.cell.type <- parallel::mclapply(target,
                                    genomation::ScoreMatrixBin,
                                           windows = windows,
                                           bin.num = 1,
                                           type = "bigWig", 
                                           is.noCovNA=isCovNA,
                                           weight.col=weightCol,
                                           bin.op=summaryOperation,
                                           mc.cores = mc.cores)
          
         
           # adding scores as mcols  
        scores.per.cell.type.df <- scoresAsMcols(scores.per.cell.type=
                            scores.per.cell.type,sampleIDs=sampleIDs)
                                                   
        
          
          # add normalization
      if (!is.null(normalize)){
        scores.per.cell.type.df <- normalizeScores(scores.per.cell.type.df,
                                                   normalize)}

          mcols(windows) <- scores.per.cell.type.df
           
          return(windows)
          
  
  
}

#' Identify regulatory regions around provided TSSes
#' 
#' The function identifies the regulatory regions around provided
#' TSSes over a pre-defined window. The function needs a GRanges
#' object for TSSes with meta-columns corresponding to expression
#' levels in different cell types or conditions. 
#' 
#' @param regActivity a GRanges object output from \code{regActivity}
#'        function
#' @param geneExpression a GRanges object output from \code{targetneExp} that
#' contains the TSS location and associated gene expression values 
#' per cell type or condition as meta data. Each row should have a
#' "name" and "name2" columns for unique id or name/symbol for the
#' gene which the TSS is associated with. One is Ensembl id 
#' and the other could be used for the gene symbol. Other metadata
#' column names should represent sample names/ids and should
#' match the GRanges object provided via regActivity argument.
#' @param upstream number of basepairs upstream from TSS to look for
#' regulatory regions. default 500kb
#' @param downstream number of basepairs downstream from TSS to look for 
#' regulatory regions. default 500kb
#' @param mc.cores (def:1) Define the number of cores to use;
#' at most how many child processes will be run simultaneously 
#' using mclapply from parallel package. Parallelization requires at 
#' least two cores.
#' @param force (DEFAULT: FALSE) This argument allows to test input enhancers
#' and genes in 1-to-1 manner. Meaning,a gene expression of gene1 is modelled 
#' using enhancer1 enhancer activity, gene2 gene expression is modelled with
#' enhancer2 enhancer activity, etc. Thus, input gene expression GRanges object
#' (geneExpression) needs to have equal length as enhancer activity GRanges object 
#' (regActivity). It overwrites upstream and downstream arguments since 1-to-1
#' relationship is tested. 
#' 
#' @return An output of regActivityAroundTSS() is a GRangesList object that 
#' contains per gene GRanges with location of the corresponding TSS and
#' regulatory regions identified around that gene. Names for the GRangesList 
#' are unique gene ids/names. Metadata for each GRanges object in the 
#' GRangesList represents regulatory activity and gene expression quantified 
#' across a number of samples (.bw files) that have matched IDs in RNA-Seq and
#' CHiP-Seq,DNase-Seq or bisulfite sequencing experiment. 
#' The GRanges objects have the following metadata columns:
#'  1. featureType: either "gene" or "regulatory"
#'  2. name: name/id for gene and enhancers. Gene name could be id from a 
#'  database enhancer name should be in the format as follows "chr:start-end"
#'  3. name2: a secondary name for the feature, such as gene symbol "PAX6" etc. 
#'  not necessary for enhancers could be NA
#'  4. other columns: numeric values for gene expression or regulatory actvity.
#'    Column names represent sample names/ids.
#' 
#' @examples  # INPUT1: CREATING Toy example for quantified gene expression:
#' 
#' geneExpression <- GRReg1_toy
#'   geneExpression$bw1 <- rep(1,length(GRReg1_toy))
#'   geneExpression$bw2 <- rep(2,length(GRReg1_toy))
#'   geneExpression$bw3 <- rep(3,length(GRReg1_toy))
#'   
#'   # INPUT2: CREATING TOY example for quantified enhancer activity:
#'   
#' regRegion <- GRReg2_toy
#'    regRegion$bw1 <- rep(3,length(regRegion))
#'    regRegion$bw2 <- rep(4,length(regRegion))
#' 
#' # Overlapping quantified enhancer activity and gene expression:
#' # OUTPUT  regActivityAroundTSS():
#' regActivityAroundTSS(regActivity=regRegion, geneExpression=geneExpression,
#' upstream=1,downstream=1)
#' 
#' # when different upstream/downstream argument is used:
#' 
#' regActivityAroundTSS(regRegion,geneExpression,upstream=5,downstream=5)
#' 
#' # force=T, forcing 1-to-1 relationship
#'  regActivityAroundTSS(regRegion[1:length(geneExpression)],
#'  geneExpression,upstream=5,downstream=5,force=TRUE)
#' 
#' @details only enhancers located within (+/-)upstream/downstream of TSS 
#' are identified,extracted and reported in output (together with info
#' about gene expression). Sample id's (corresponding to the cell types 
#' or conditions) are included only if both, 1) gene expression values and
#' 2) quantified regulatory activity are available in TSS and 
#' regActivity objects. Non-overlapping cell types are excluded.
#'  
#' 
#' 
#' @export
regActivityAroundTSS <- function(regActivity,
                                 geneExpression,
                                 upstream=500000,
                                 downstream=500000,
                                 mc.cores=1,
                                 force=FALSE){
  
  
  if (is.null(geneExpression$name)) {stop("geneExpression object does not contain info about gene 
                               name. geneExpression GRanges object should be 2nd arg")}
  
  
  # getting modelling data for testing 1-to-1 relationship
  
    if (force==TRUE){
    
    if (length(regActivity)!=length(geneExpression)){stop(
          "Force=T, but data doesn't have an equal length")
      
      }else if (length(regActivity)==length(geneExpression)){
    
        geneExpressionActivity <- parallel::mclapply(1:length(geneExpression),function(x){
    
      GeneInfo <- geneExpression[x]
      EnhancerInfo <- regActivity[x]
    
      #keep names, because GeneInfo is stripped off this info afterwards
      names <- c(GeneInfo$name,GeneInfo$name2)
      
      # getting shared info between gene expression and enh activity datasets
      CommonData <- colnames(mcols(GeneInfo))[colnames(mcols(GeneInfo))%in%
                                                colnames(mcols(EnhancerInfo))]
      
      # adjusting for common cell types
          mcols(GeneInfo) <- mcols(GeneInfo)[CommonData]
          mcols(EnhancerInfo) <- mcols(EnhancerInfo)[CommonData]
          
      EnhGene <- c(GeneInfo,EnhancerInfo)
            # adding meta-data   
      EnhGene$featureType <- c("gene","regulatory")
      EnhGene$name <- c(names[1],as.character(EnhancerInfo))
      EnhGene$name2 <- c(names[2],as.character(EnhancerInfo))
      
      #ordering meta-data
      FirstMetaData <- c("featureType","name","name2")
      BigWigMetaData <- colnames(mcols(EnhGene))[!colnames(mcols(EnhGene))
                                                 %in%FirstMetaData]
        mcols(EnhGene) <- mcols(EnhGene)[c(FirstMetaData,BigWigMetaData)]
      
          return(EnhGene)
            })
  
         names(geneExpressionActivity) <- geneExpression$name
  
      }
  }
  
  # testing many-to-many relationship
  
    if (force==FALSE){
      
      # per gene analysis
      geneExpression.extended <- split(geneExpression,
                            as.character(geneExpression$name))
    
        geneExpressionActivity <- parallel::mclapply(geneExpression.extended,function(x){
          
          # overlap extended geneExpression (+/-downstream and upstream) & regRegion
        geneExpression.regAct.overlap <- as.data.frame(findOverlaps(
                trim(suppressWarnings(promoters(x,upstream,downstream))),
                                                         regActivity))
          
          # if there is any overlap - GO!
          if (nrow(geneExpression.regAct.overlap)!=0) {
            regActivity <- regActivity[geneExpression.regAct.overlap$subjectHits]
            
            # adjusting mcols for reg region
                  name <- as.character(regActivity)
                  featureType <- rep("regulatory",length(name))
                  name2 <- rep(NA,length(name))
                  
                  mcols(regActivity) <- cbind(featureType,
                                              name,
                                              name2,
                                              data.frame(mcols(regActivity)))
                  
            # adjusting mcols for gene expression object
                  featureType <- "gene"
                  mcols(x) <- cbind(featureType,
                                    data.frame(mcols(x)))
                  
                  geneExpression.colnames <- colnames(mcols(x))[colnames(mcols(x))%in%
                                                     colnames(mcols(regActivity))]
                  mcols(x) <-  mcols(x)[geneExpression.colnames]
                  mcols(regActivity) <-  mcols(regActivity)[geneExpression.colnames]
                  
                  geneReg <- c(x,regActivity)
                  geneReg$featureType  <- as.character(geneReg$featureType)
                  geneReg$name   <- as.character(geneReg$name)
                  geneReg$name2   <- as.character(geneReg$name2)
                  
            return(geneReg)
            
          }
        },
        mc.cores = mc.cores)
  
    }
  
  
  # removing NULL entries
  geneExpressionActivity <- geneExpressionActivity[!sapply(geneExpressionActivity,is.null)]
  # returning GRangesList
  geneExpressionActivity <- GRangesList(geneExpressionActivity)
  
  return(geneExpressionActivity)
  
}





#' Quantifies gene expression measured by RNA-Seq  
#' 
#' 
#' The function quantifies gene expression as a sum of exon 
#' expressions quantified over pre-defined exon regions using 
#' signal from RNA-Seq tracks (bigwig files) and returns a
#' GRanges object with TSS location and corresponding gene
#' expression levels quantified over a set of samples. 
#'  
#' 
#' 
#' @param exons A \code{\link[InteractionSet]{GInteractions}} object with 
#' Anchor1 representing the exon locations, and Anchor2 represents location of 
#' the corresponding gene. A meta-columns with following info should be present:
#' 1) name (character)- ENSEMBL or other gene identifier; 2) name2 (character,
#' optional) - 2nd gene identifier. 
#' Optionally, GRanges object can be used as input object as well. It should
#' contain exon regions over which the expression will be calculated.
#' For this object, a meta-columns with following info are necessary: 
#' 1) reg - GRanges object - gene location, or TSS location, 2) name (character)
#' - ENSEMBL or other gene identifier; 3) name2(character,optional) - 
#' 2nd gene identifier.  
#' It is strongly suggested to adjust seqlengths of this object to be equal to 
#' the seqlenghts Hsapiens from the appropriate package 
#' (BSgenome.Hsapiens.UCSC.hg19 or whatever needed version).
#' @param target a named list of RNA-Seq BigWig files. Names correspond 
#' to the unique sample ids/names. Stranded and unstranded libraries allowed.
#'  BUT!!! It is crucial that forward and reverse RNA-Seq libraries are listed
#' in a row (eg one on top of each other)
#' @param sampleIDs NULL (default). A vector of unique sample
#' ids/names(.bw files), ordered as the bigwig files are ordered. When NULL
#' basenames of .bw files is used as a unique sample ids/names.
#' @param libStrand a vector of "*","+,"-" (default NULL) which needs to be 
#' entered as an argument.This vector provides info about the order of RNA-Seq 
#' libraries based on their strandness: "+" corresponds to forward/positive 
#' RNA-Seq bigwig files; "-" corresponds to reverse/negative RNA-Seq bigwig
#' files and "*" is unstranded library. When all libraries are unstranded then
#' the vector should contain a list of "*" with the lenght equal to the number
#' of analyzed RNA-Seq libraries (eg bigwig files). 
#' If libStrand=NULL than function will do it automatically, eg
#' create a vector of "*".It is crucial that stranded RNA-Seq libraries are
#' listed in a row (eg one on top of each other)
#' @param summaryOperation "mean"(default). An argument for 
#' \code{\link[genomation]{ScoreMatrixBin}} that is in the nutshell of 
#' quantifying exon expression across pre-defined exon regions.
#' This designates which summary operation should be used over the regions.
#' Currently, only mean is available, but "median" or "sum" will be implemented
#' in the future.
#' @param mc.cores (def:1) Define the number of cores to use;
#' at most how many child processes will be run simultaneously 
#' using mclapply from parallel package. Parallelization requires at 
#' least two cores.
#' @param normalize NULL(default). Optional "quantile" and "ratio"
#' If set to "quantile" activity measures are quantile normalized as 
#' implemented in \code{\link[preprocessCore]{normalize.quantiles}} and
#' returned ; if set to "ratio" then "median ratio method" implemented
#' as \code{\link[DESeq2]{estimateSizeFactorsForMatrix}} is used to normalize 
#' activity measures.
#' 
#'         
#' @return a GRanges object where its meta-columns correspond to quantified
#' gene expression across cell type or condition. GRanges correspond
#' to the TSS location of the analyzed gene. Additionally, each TSS 
#' contains following metadata:a "name" and "name2" columns for 
#' unique id or name/symbol for the gene which the TSS is associated with. 
#' One is Ensembl id and the other could be used for the gene symbol. 
#' Other metadata column names should represent sample names/ids and should
#' match the GRanges object provided via regActivity argument.
#' 
#' @details For a gene of interest and across all samples (.bw files)
#' individaully,the expression is firstly calculated for all exon 
#' regions which correspond to the gene of interest. Then, per gene
#' exon expression scores are summed together and divided by a total exon length
#' as followed:
# #'  \deqn{Gene Expression=\sum{n=1}^{Exons{n}}
# #'  (MeanExonExpression{n}*ExonLength{n})/(GeneLength)}
#' Normalization is runned on the level of gene expression. Currently, the 
#' relevant bigWig files are required to calculate activity. This function might 
#' be extended to work with BAM files in the future. RNA-Seq .bw files can 
#' originate from stranded,unstranded or mixed libraries.
#' 
#' 
#' 
#' @examples #INPUT1 DEFINING .BW FILES:
#' 
#' 
#' test.bw <- system.file("extdata", "test.bw",package = "reg2gene")
#' test2.bw <- system.file("extdata", "test2.bw",package = "reg2gene")
#' 
#' #INPUT2 DEFINING EXONS:
#' exons <- GRanges(c(rep("chr1",2),"chr2",rep("chr1",3)),
#'                       IRanges(c(1,7,9,15,1,21),c(4,8,14,20,4,25)),
#'                                             c(rep("+",3),rep("-",3)))
#'  exons$reg <-  exons[c(1,1,3,5,5,5)]
#'  exons$name2 <- exons$name <- paste0("TEST_Reg",c(1,1,3,5,5,5))
#'  bwToGeneExp(exons = exons,target = c(test.bw,test2.bw),
#'          sampleIDs=c("CellType1","CellType2"))
#'  
#'  # OUTPUT bwToGeneExp():                                                                                                                       
#' bwToGeneExp(exons = exons,target = c(test.bw,test2.bw))
#' 
#' # adding different sample IDs
#' 
#' bwToGeneExp(exons = exons,target = c(test.bw,test2.bw),
#'             sampleIDs=c("CellType1","CellType2"))
#' 
#' # if exons input is GInteractions class object,the same output is obtained
#' 
#' 
#' exonsGI= GInteractions(exons,exons$reg)
#'    exonsGI$name=exons$name
#'    exonsGI$name2=exons$name2
#'    
#' bwToGeneExp(exons = exonsGI,target = c(test.bw,test2.bw))
#' 
#' @export
bwToGeneExp <- function(exons,
                        target,
                        sampleIDs=NULL,
                        libStrand=NULL,
                        summaryOperation="mean",
                        mc.cores=1,
                        normalize=NULL){
  
  
 
  #test if there is any info about genes in exon granges object
  if (any(stringr::str_detect(colnames(mcols(exons)),"name"))==0){
    stop("No info about the gene name, e.g. name column is missing")}
  
  if ((class(exons)=="GRanges")&
      (any(stringr::str_detect(colnames(mcols(exons)),"reg"))==0)){
    stop("No info about the gene location, e.g. reg column is missing ")}
  
  if ((class(exons)=="GInteractions")&
      (any(stringr::str_detect(colnames(mcols(exons)),"name2"))==0)){
    stop("No info about gene name2,e.g. name2 column needed for GInteractions")}
  
  
  
  if (is.null(sampleIDs)) {
    bw.exts = c(".bw",".bigWig",".bigwig",".BigWig", ".BIGWIG", ".BW")
    sampleIDs <- stringr::str_replace(basename(target),
                                      paste(bw.exts,collapse="|"),"")
  }
  
  # adjusting for GInteractions object input - rearranging to GRanges object
  
  if (class(exons)=="GInteractions"){
    
    require(GenomicInteractions)
    
    exons <- GRanges(anchorOne(exons),
                        reg=anchorTwo(exons),
                        name=exons$name,
                        name2=exons$name2)
    
  }
  
  
  # order exons such that you wouldn't have problems afterwards with orientation
  exons <- exons[order(as.character(strand(exons)))]
  
  # separate genes form forward and reverse strand
  exons.splitted <- split(exons,as.character(strand(exons)))
  gene.metadata <- length(mcols(exons)) # metadata info about corresponding gene
  
  # arranging a problem of strandness of RNA-Seq libraryies
  # 1. if object libStrand is empty then strands are assumed to be "*"
  if (is.null(libStrand)) {libStrand <- rep("*",length(target))}
  
  # quantify expression per exon with adjustment to strandness of RNA-Seq 
  # libraries
  ExonExpression <- exonExpressionStrandAdjusted(exonsForQuant = exons,
                                                 exonsSplitted = exons.splitted,
                                                 target = target,
                                                 sampleIDs=sampleIDs, 
                                                 libStrand=libStrand,
                                                 mc.cores=mc.cores,
                                                 summaryOperation=summaryOperation)         
  # quantify expression per gene   
  exonExpressionPerGene <- split(ExonExpression,
                                 as.character(ExonExpression$name))
  
  GeneExpression <- list()
     for (i in 1:length(exonExpressionPerGene)){
           gene.id <- names(exonExpressionPerGene)[[i]]
          GeneExpression[[gene.id]] <- quantifyGeneExpression(
                                              exonExpressionPerGene[[gene.id]],
                                              gene.metadata)}
  
  
    GeneExpression.df <- do.call("rbind",GeneExpression)

   #############            
   # normalization
   if (!is.null(normalize)){
     GeneExpression.df <-  normalizeScores(GeneExpression.df,
                                           normalize)}


        # arranging gene coordinates as TSS
    Genes.object <- data.frame(unique(mcols(ExonExpression)[1:gene.metadata]),
                               stringsAsFactors = F)
    
    
        #extract gene location
              # TSS coord of genes
    
                 tss <- promoters(GRanges(Genes.object$reg.seqnames,
                                IRanges(Genes.object$reg.start,
                                        Genes.object$reg.end),
                                strand=Genes.object$reg.strand),
                                1,1)


        # adding meta-data
             name <- Genes.object$name
             name2 <- Genes.object$name2
                       if (length(name2)==0){
                         name2 <-  rep(NA,length(name))}# if no 2nd name
             metadata <- data.frame(name,
                                    name2,
                  GeneExpression.df[match(name,rownames(GeneExpression.df)),],
                            stringsAsFactors = F)
    
             if (length(colnames(metadata))==3) {
               
               colnames(metadata)[3] <- sampleIDs[1]
               
               }
             
          mcols(tss) <- metadata

  return(tss)
  
} 
BIMSBbioinfo/reg2gene documentation built on May 3, 2019, 6:42 p.m.