R/statistics.R

Defines functions intercompare orgintercompareinput intracompare limmacomp wilcoxcomp fishermean quantileattr

Documented in intercompare intracompare orgintercompareinput

quantileattr <- function(dat = rates, quantiles = quantilenum){
  
  absquantiles <- quantile(x = dat$absrate, probs = seq(0, 1, 1/quantiles))
  relquantiles <- quantile(x = dat$relrate, probs = seq(0, 1, 1/quantiles))
  
  absquantiles <- as.vector(absquantiles)
  relquantiles <- as.vector(relquantiles)
  
  absdat <- dat[c('gene_id', 'absrate')]
  reldat <- dat[c('gene_id', 'relrate')]
  absdat$absq <- 'quantile1'
  reldat$relq <- 'quantile1'
  
  i <- 2
  for(i in 2:length(absquantiles)){
    quantilename <- paste0('quantile', (i - 1))
    absdat$absq[absdat$absrate > absquantiles[i - 1] & absdat$absrate <= absquantiles[i]] <- quantilename
    reldat$relq[reldat$relrate > relquantiles[i - 1] & reldat$relrate <= relquantiles[i]] <- quantilename
    
  }
  
  absdat <- absdat[order(-absdat$absrate),]
  reldat <- reldat[order(-reldat$relrate),]
  row.names(absdat) <- 1:nrow(absdat)
  row.names(reldat) <- 1:nrow(reldat)
  
  resdat <- list(absres = absdat, relres = reldat)
  
  
}

fishermean <- function(line = line){
  
  a11 <- line[1]
  a12 <- line[2]
  a21 <- a22 <- mean(line)
  mat <- matrix(c(a11, a12, a21, a22), nrow = 2, byrow = TRUE)
  mat <- round(mat)
  fisherres <- fisher.test(mat)
  fisherp  <- fisherres$p.value
  
  return(fisherp)
  
}

wilcoxcomp <- function(idx, mat1 = dat1, mat2 = dat2){
  
  wilcoxres <- wilcox.test(unlist(mat1[idx,]), unlist(mat2[idx,]))
  wilcoxp <- wilcoxres$p.value
  
  return(wilcoxp)
  
}

limmacomp <- function(mat1 = dat1, mat2 = dat2){
  
  mat <- cbind(mat1, mat2)
  
  #mat <- log2(mat)
  
  pd <- data.frame(group = colnames(mat), stringsAsFactors = FALSE)
  row.names(pd) <- colnames(mat)
  pd$group <- c(rep('group1', ncol(mat1)), rep('group2', ncol(mat2)))
  
  #library(limma)
  designstr <- '~ group'
  designstr <- as.formula(designstr)
  
  design <- model.matrix(designstr, data = pd)
  fit1 <- limma::lmFit(mat, design)
  fit2 <- limma::eBayes(fit1)
  
  allg.limma <- limma::topTable(fit2, coef = 2, n = dim(fit1)[1])
  
  return(allg.limma)
  
}



#'Divide genes into different quantile groups according to their rates
#'
#'Divide genes into different quantile groups according to their rates 
#'inferred from the \code{calrate} function, or \code{mcalrate} function.
#'
#'@param inferres The result list generated by the \code{calrate} function or 
#'  \code{mcalrate} function. 
#'@param targetgenes Which genes need to be divided into quantile groups. If 
#'  it is NULL (The default value), all genes provided by the parameter 
#'  \code{inferres} will be analyzed. If its value is 'significant', only 
#'  genes judged as significant genes by \code{calrate} or \code{mcalrate} 
#'  will be analyzed. While the the value of this parameter can also be a 
#'  vector with gene symbols as elements and only genes covered by this 
#'  vector will be analyzed.
#'@param quantilenum How many quantile groups need to be divided. Default 
#'  value is 4. 
#'@return A list with 2 data.frames as elements. One of them records the 
#'  absolute elongation rates of the genes (bp/min) and divide the genes into 
#'  several quantile groups with genes in quantile1 group having the lowest 
#'  rates. The other data.frame records the relative elongation rates of the 
#'  genes (percent of gene body/min) and divide them into quantile groups. If 
#'  the value of parameter \code{inferres} is from \code{mcalrate}, the 
#'  absolute and relative rates of the genes are the mean of all the Pro-seq 
#'  or Gro-seq pairs.
#'@export
intracompare <- function(inferres, targetgenes = NULL, quantilenum = 4){
  
  if(!('report' %in% names(inferres))){
    
    i <- 1
    for(i in 1:length(inferres)){
      
      subres <- inferres[[i]]$report
      
      if(!is.null(targetgenes)){
        
        if(targetgenes == 'significant'){
          subres <- subset(subres, significance == 'significant')
        }else{
          
          subres <- subset(subres, gene_id %in% targetgenes)
          
        }
        
      }
      
      subgenes <- subres$gene_id
      
      if(i == 1){
        finalgenes <- subgenes
      }else{
        finalgenes <- intersect(finalgenes, subgenes)
      }
      
    }
    
    j <- 1
    for(j in 1:length(inferres)){
      
      subres <- inferres[[j]]$report
      subres <- subres[match(finalgenes, subres$gene_id),]
      subres$relrate <- subres$rate/(abs(subres$end - subres$start) + 1)
      subres <- subres[c('gene_id', 'rate', 'relrate')]
      names(subres) <- c('gene_id', paste0('absrate_', j), paste0('relrate_', j))
      
      subabs <- subres[,c(1, 2)]
      subrel <- subres[,c(1, 3)]
      
      if(j == 1){
        absrates <- subabs
        relrates <- subrel
      }else{
        absrates <- cbind(absrates, subabs[2])
        relrates <- cbind(relrates, subrel[2])
      }
      
      
    }
    
    absrates <- data.frame(gene_id = absrates$gene_id, absrate = rowMeans(absrates[-1]),
                           stringsAsFactors = FALSE)
    relrates <- data.frame(gene_id = relrates$gene_id, relrate = rowMeans(relrates[-1]),
                           stringsAsFactors = FALSE)
    
    rates <- cbind(absrates, relrates[-1])
    row.names(rates) <- 1:nrow(rates)
    
    
  }else{
    
    subres <- inferres$report
    
    if(!is.null(targetgenes)){
      
      if(targetgenes == 'significant'){
        subres <- subset(subres, significance == 'significant')
      }else{
        
        subres <- subset(subres, gene_id %in% targetgenes)
        
      }
      
    }
    
    subres$relrate <- subres$rate/(abs(subres$end - subres$start) + 1)
    subres <- subres[c('gene_id', 'rate', 'relrate')]
    names(subres) <- c('gene_id', 'absrate', 'relrate')
    
    rates <- subres
    row.names(rates) <- 1:nrow(rates)
    
  }
  
  res <- quantileattr(dat = rates, quantiles = quantilenum)
  
  names(res) <- c('absolute_rate', 'relative_rate')
  
  return(res)
  
}



#'Organize the results from the function \code{mcalrate} to a data.frame
#'
#'Organize the results calculated by the function \code{mcalrate} to a 
#'data.frame with the 1st column as gene symbols and the other columns as gene 
#'elongation rates inferred for each Pro-seq or Gro-seq pair. This data.frame 
#'can be used as input for the function \code{intercompare}.
#'
#'@param reslist The result list generated by the function \code{mcalrate}.
#'@param targetgenes Which genes need to be included in the output of this 
#'  function. If it is NULL (The default value), all genes provided by the 
#'  parameter \code{reslist} will be analyzed. If its value is 'significant', 
#'  only genes judged as significant genes by \code{mcalrate} will be 
#'  included. While the the value of this parameter can also be a vector with 
#'  gene symbols as elements and only genes covered by this vector will be 
#'  included. 
#'@return A data.frame with the 1st column as gene symbols and the other 
#'  columns as gene elongation rates inferred for each Pro-seq or Gro-seq 
#'  pair. This data.frame can be used as input for the function 
#'  \code{intercompare}.
#'@export
orgintercompareinput <- function(reslist, targetgenes = NULL){
  
  i <- 1
  for(i in 1:length(reslist)){
    
    res <- reslist[[i]]
    res <- res$report
    
    if(!is.null(targetgenes)){
      
      if(targetgenes == 'significant'){
        res <- subset(res, significance == 'significant')
      }else{
        
        res <- subset(res, gene_id %in% targetgenes)
        
      }
    }
    
    res <- res[c('gene_id', 'rate')]
    gene_id <- res$gene_id
    
    if(i == 1){
      gene_ids <- gene_id
      res <- res[match(gene_ids, res$gene_id),]
      reses <- res
    }else{
      gene_ids <- intersect(gene_ids, gene_id)
      res <- res[match(gene_ids, res$gene_id),]
      reses <- reses[match(gene_ids, reses$gene_id),]
      reses <- cbind(reses, res[-1])
    }
    
    
  }
  
  names(reses) <- c('gene_id', paste0('rate_', seq(1, (ncol(reses) - 1), 1)))
  
  return(reses)
  
}



#'Compare gene elongation rates between 2 different conditions
#'
#'Compare gene elongation rate difference for each gene between 2 different 
#'conditions. For each condition, if the number of replicates is < 3, Fisher's 
#'test will be used to calculate the p-values and adjusted p-values of the 
#'rate differences, while if the number of replicates is >= 3, Wilcox test and 
#'limma will be used to calculate 2 sets of p-values and adjusted p-values 
#'(Wilcox test based and limma based).
#'
#'@param inferresmat A data.frame recording the elongation rates for the genes 
#'  in different conditions. Can be generated using the function 
#'\code{orgintercompareinput}.
#'@param groupnames A vector with the elements corresponding to the condition 
#'  names of the rate columns in the data.frame indicated by the parameter 
#'  \code{inferresmat}. 
#'@return A data.frame with one row for one gene and the columns are the 
#'  average transcription elongation rates for each condition (log2 
#'  transformed), the log2FC, the p-value and adjusted p-value (adjusted by 
#'  Benjamini & Hochberg method) of the difference.
#'@export
intercompare <- function(inferresmat, groupnames){
  
  groups <- unique(groupnames)
  if(length(groups) != 2){
    
    print('Currently only comparison between 2 groups is supported, so the data should come from 2 groups')
    
    return(NULL)
    
  }
  
  groupidces <- paste0(groupnames, '.', seq(1, length(groupnames), 1))
  names(inferresmat) <- c('gene_id', groupidces)
  
  dat1 <- groupidces[groupnames == groups[1]]
  dat2 <- groupidces[groupnames == groups[2]]
  
  dat1 <- inferresmat[c('gene_id', dat1)]
  dat2 <- inferresmat[c('gene_id', dat2)]
  
  colnames(dat1) <- c('gene_id', paste0(groups[1], '.', 1:(ncol(dat1) - 1)))
  colnames(dat2) <- c('gene_id', paste0(groups[2], '.', 1:(ncol(dat2) - 1)))
  
  
  row.names(dat1) <- dat1$gene_id
  row.names(dat2) <- dat2$gene_id
  dat1 <- dat1[2:ncol(dat1)]
  dat2 <- dat2[2:ncol(dat2)]
  
  dat1 <- log2(dat1)
  dat2 <- log2(dat2)
  
  if(ncol(dat1) < 3 & ncol(dat2) < 3){
    
    dat1 <- rowMeans(dat1)
    dat2 <- rowMeans(dat2)
    
    dat <- cbind(dat1, dat2)
    
    pvals <- apply(X = dat, MARGIN = 1, FUN = fishermean)
    padjs <- p.adjust(pvals, method = 'BH')
    log2fcs <- dat2 - dat1
    
    group1rates <- dat1
    group2rates <- dat2
    gene_ids <- names(dat1)
    
  }else{
    
    limmares <- limmacomp(mat1 = dat1, mat2 = dat2)
    limmapvals <- limmares[c('P.Value', 'adj.P.Val')]
    names(limmapvals) <- c('pval(limma)', 'padj(limma)')
    limmapvals$gene_id <- row.names(limmares)
    
    testlist <- as.list(seq(1, nrow(dat1), 1))
    
    pvals <- lapply(X = testlist, FUN = wilcoxcomp, mat1 = dat1, mat2 = dat2)
    names(pvals) <- row.names(dat1)
    pvals <- unlist(pvals)
    padjs <- p.adjust(pvals, method = 'BH')
    log2fcs <- rowMeans(dat2) - rowMeans(dat1)
    
    group1rates <- rowMeans(dat1)
    group2rates <- rowMeans(dat2)
    gene_ids <- row.names(dat1)
    
  }
  
  res <- data.frame(gene_id = gene_ids, group1rate = group1rates, group2rate = group2rates,
                    pval = pvals, padj = padjs, log2FC = log2fcs, stringsAsFactors = FALSE)
  
  names(res) <- c('gene_id', paste0(groups[1], '.rate'), paste0(groups[2], '.rate'),
                  'pval', 'padj', paste0('log2FC(', groups[2], '/', groups[1], ')'))
  row.names(res) <- 1:nrow(res)
  
  if(exists('limmapvals')){
    res <- merge(res, limmapvals, by = c('gene_id'))
  }
  
  res <- res[order(res$padj, res$pval, -abs(res[,6])),]
  row.names(res) <- 1:nrow(res)
  
  return(res)
  
}
yuabrahamliu/proRate documentation built on Nov. 3, 2024, 10:14 a.m.