R/downsizing.R

Defines functions computeUpsizedGC downsizedRichnessL2T downsizeGC.all downsizeGC countSampledGenesGC countSampledGenes sampleRandomly indexReadsGC

Documented in computeUpsizedGC countSampledGenes countSampledGenesGC downsizedRichnessL2T downsizeGC downsizeGC.all indexReadsGC sampleRandomly

#' \code{indexReads} 
#' @title indexReads
#' @description for a given column of the raw count matrix this function indexes the count vector 
#'        in a vector of reads of the length of the sum of the counts. It allows after to randomly 
#'        select a subset
#' @author Edi Prifti
#' @param v : an integer raw count vector
#' @param silent : default FALSE, debugging
#' @return an character indexed vector
indexReads <- function (v, silent = FALSE) 
{
  v.no0 <- v[v != 0]
  res <- rep(NA, sum(v.no0))
  ind.d <- 1
  ind.f <- 1
  for (i in 1:length(v.no0)) {
    if (!silent & i%%10000 == 0) print(i)
    ind.f <- ind.d + v.no0[i] - 1
    res[ind.d:ind.f] <- paste(names(v.no0)[i], 1:v.no0[i],  sep = "_")
    ind.d <- ind.f + 1
  }
  return(res)
}


#' \code{indexReadsGC} 
#' @title indexReadsGC
#' @description for a given column of the raw count matrix this function indexes the count vector 
#'        in a vector of reads of the length of the sum of the counts. It allows after to randomly 
#'        select a subset
#' @author Edi Prifti
#' @param v : an integer raw count vector
#' @param silent : default FALSE, debugging
#' @return an character indexed vector
#' @note this function is optimized when dowsizing for gene count
indexReadsGC <- function(v, silent=TRUE)
{
  # keep only reads that are not 0
  v.no0 <- v[v!=0]
  # now inflate the vector
  res <- rep(NA,sum(v.no0))
  ind.d <- 1
  ind.f <- 1
  for(i in 1:length(v.no0))
  {
    if(!silent) if(i%%1000==0){print(i)}
    ind.f <- ind.d + v.no0[i]-1
    res[ind.d : ind.f] <- names(v.no0)[i]
    ind.d <- ind.f+1
  }
  return(res)
}

#' \code{sampleRandomly} 
#' @title sampleRandomly
#' @export
#' @description This function samples randomly a unique subset of the given indexed vector
#' @author Edi Prifti
#' @param v.ind : a character vector of the indexed reads
#' @param level : default 11e6, the number of reads to be selected randomly. This sould be smaller than the size of v.ind 
#' @return a character indexed vector
sampleRandomly <- function(v.ind, level=11000000)
{
  res <- sample(x=v.ind, size=level, replace=F)
  return(res)
}


#' \code{countSampledGenes} 
#' @title countSampledGenes
#' @export
#' @description counts the number of genes that have been sampled by un_indexing
#' @author Edi Prifti
#' @param v.samp : a character vector of the sampled indexed reads (output of sampleRandlomly)
#' @return a table with counts for each gene
countSampledGenes <- function(v.samp)
{
  res <- gsub("_.*","",v.samp)
  res <- table(res)
  return(res)
}

#' \code{countSampledGenesGC} 
#' @title countSampledGenesGC
#' @export
#' @description counts the number of genes that have been sampled by un_indexing
#' @author Edi Prifti
#' @param v.samp : a character vector of the sampled indexed reads (output of sampleRandlomly)
#' @return a table with counts for each gene
#' @note an optimized version for the gene count downsizing
countSampledGenesGC <- function(v.samp)
{
  res <- unique(v.samp)
  return(length(res))
}


#' \code{downsizeGC} 
#' @title downsizeGC
#' @export
#' @description This function takes a matrix with raw reads counts and computes the number of genes 
#' at a given downsizing level a given number of times.
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : raw read count matrix with gene_ids as rownames
#' @param level : default 11e6, the downsizing level number of reads to be selected randomly. 
#' @param repetitions : default 30, the number of times the drawing is performed. Usually 30 or 10 to speed things out
#' @param silent : default is FALSE prints the status of downsizing
#' @return a matrix containing in rows a vector for each repetition and in columns the number of downsized genes for each sample
#' @note if the downsizing level is higher than the number of reads for a given sample than the result will be NA
downsizeGC <- function(data, level= 11000000, repetitions = 30, silent=FALSE)
{
  # for all the individuals of the set
  # data is supposed to contain raw reads and rownames to be id_fragment_external
  res <- matrix(NA,nrow=repetitions, ncol=ncol(data)); 
  rownames(res) <- 1:repetitions; colnames(res) <- colnames(data)
  for(ind in 1:ncol(data)){
    if(!silent) print(paste(ind,"Sample",colnames(data)[ind],"with",sum(data[,ind]),"reads and",sum(data[,ind]!=0),"genes"))
    if(level>sum(data[,ind])){
      if(!silent) print("This sample will not be downsized since the number of mapped reads is lower than the level")
    }else {
      v <- data[,ind]; names(v) <- rownames(data)
      v.ind <- indexReadsGC(v)
      for(step in 1:repetitions){
        v.samp <- sampleRandomly(v.ind=v.ind, level=level)
        res[step,ind] <- countSampledGenesGC(v.samp)
        if(!silent) print(paste("        step",step,"with",res[step,ind],"genes"))
      }
    }
  }
  return(res)
}


#' \code{downsizeGC.all} 
#' @title downsizeGC.all
#' @export
#' @description This function takes a matrix with raw reads counts and computes the number of genes at different 
#' downsizing levels a given number of times. This is similar to the downsizeGC function but for optimization purposes 
#' it downsizes at different thresholds all together
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : raw read count matrix with gene_ids as rownames
#' @param levels : default seq(1E06,11E06,1E06), the downsizing levels number of reads to be selected randomly. 
#' @param repetitions : default 10, the number of times the drawing is performed. Usually 30 or 10 to speed things out
#' @param silent : default is FALSE prints the status of downsizing
#' @return a list of matrixes one per sample containing in rows a vector for each repetition and in columns the number of downsized genes for 
#'        each downsizing level
#' @note if the downsizing level is higher than the number of reads for a given sample than the result will be NA
downsizeGC.all <- function(data, levels= c(seq(1E06,11E06,1E06)), repetitions = 10, silent=FALSE)
{
  # for all the individuals of the set
  # data is supposed to contain raw reads and rownames to be id_fragment_external
  result <- list()
  for(ind in 1:ncol(data)){
    if(!silent) print(paste(ind,"Sample",colnames(data)[ind],"with",sum(data[,ind]),"reads and",sum(data[,ind]!=0),"genes"))
    res <- matrix(NA, nrow=repetitions, ncol=length(levels)); 
    rownames(res) <- 1:repetitions; colnames(res) <- paste("down_",levels/1E06,"M",sep="")
    for (j in 1:length(levels))
    {
      if(levels[j]>sum(data[,ind]))
      {
        if(!silent) print(paste("Sample not downsized to ", levels[j]/1E06,"M since the number of mapped reads is lower",sep=""))
      }else 
      {
        v <- data[,ind]; names(v) <- rownames(data)
        v.ind <- indexReadsGC(v)
        for(step in 1:repetitions)
        {
          v.samp <- sampleRandomly(v.ind=v.ind, level=levels[j])
          res[step,j] <- countSampledGenesGC(v.samp)
          if(!silent) print(paste("        downsizing ",levels[j]/1E06,"M, step",step,"with",res[step,j],"genes"))
        }
      }
    }
    result[[ind]] <- res
  }
  names(result) <- colnames(data)
  return(result)
}


#' \code{downsizeMatrix} 
#' @title downsizeMatrix
#' @export
#' @description takes a matrix with raw read gene counts and converts it to a downsized matrix with identical number of mapped reads 
#'      for each sample (column)
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : raw read count matrix with gene_ids as rownames
#' @param level : default 11E06, the downsizing levels number of reads to be selected randomly. 
#' @param HQ_reads : To take into account a vector of high quality reads that are not mapped.
#' @param repetitions : default 1, This can be also computed several times, but one is the error minimal downsizing strategy
#' @param silent : default is FALSE prints the status of downsizing
#' @return downsized read gene count matrix corresponding to the mean counts obtained with the selected number of independant 
#'      downsizing procedure
#' @note if the downsizing level is higher than the number of reads for a given sample than the result will be NA
# modified august 5 by elc to take into account the HQ_reads and not the mapped reads for downsizing
# modified 29 oct 2015 by EP fix match NAs
downsizeMatrix <- function (data, level = 1.1e+07, HQ_reads=NULL, repetitions = 1, silent = FALSE)
{
  res <- as.data.frame(matrix(NA, nrow = nrow(data), ncol = ncol(data)))
  rownames(res) <- rownames(data)
  colnames(res) <- colnames(data)
  if (is.null(HQ_reads))
  {
    for (ind in 1:ncol(data)) 
    {
      if (!silent) 
      {
        print(paste(ind, "Sample", colnames(data)[ind], "with", sum(data[, ind]), "reads and", sum(data[, ind] != 0), "genes"))
      }
      if (sum(data[, ind]) < level) 
      {
        if (!silent) warning("This sample can't be downsized")
      } else 
      {
        v <- data[, ind]
        names(v) <- rownames(data)
        v.ind <- indexReads(v, silent=silent)
        mat.sampled <- matrix(0, nrow = nrow(data), ncol = repetitions)
        colnames(mat.sampled) <- 1:repetitions
        rownames(mat.sampled) <- rownames(data)
        for (step in 1:repetitions) 
        {
          v.samp <- sampleRandomly(v.ind = v.ind, level = level)
          tmp <- countSampledGenes(v.samp)
          index <- match(names(tmp), rownames(mat.sampled))
          if(sum(is.na(index))>0)
          {
            mat.sampled[index[!is.na(index)], step] <- tmp[!is.na(index)]
          }else
          {
            mat.sampled[index, step] <- tmp
          }
          if (!silent) print(paste("        step", step, "with", sum(mat.sampled[, step] != 0), "genes"))
        }
        if (repetitions == 1) 
        {
          res[, ind] <- mat.sampled
        } else 
        {
          res[, ind] <- rowMeans(mat.sampled)
        }
      }
    }
  } else 
  {
    if (length(HQ_reads) != ncol(data))
    {
      stop("a vector with the number of HQ_reads used for the mapping of each sample is expected")
    }
    ratio <- HQ_reads/level
    N <- colSums(data)  # reads mapped
    level_samples <- N/ratio
    for (ind in 1:ncol(data)) 
    {
      if (!silent) 
      {
        print(paste(ind, "Sample", colnames(data)[ind], "with", sum(data[, ind]), "reads and", sum(data[, ind] != 0), "genes"))
      }
      if (sum(data[, ind]) < level_samples[ind]) 
      {
        if (!silent) warning("This sample can't be downsized")
      }
      else 
      {
        v <- data[, ind]
        names(v) <- rownames(data)
        v.ind <- indexReads(v)
        mat.sampled <- matrix(0, nrow = nrow(data), ncol = repetitions)
        colnames(mat.sampled) <- 1:repetitions
        rownames(mat.sampled) <- rownames(data)
        for (step in 1:repetitions) 
        {
          v.samp <- sampleRandomly(v.ind = v.ind, level = level_samples[ind])
          tmp <- countSampledGenes(v.samp)
          if(sum(is.na(index))>0)
          {
            mat.sampled[index[!is.na(index)], step] <- tmp[!is.na(index)]
          }else
          {
            mat.sampled[index, step] <- tmp
          }
          if (!silent) print(paste("        step", step, "with", sum(mat.sampled[, step] != 0), "genes"))
        }
        if (repetitions == 1) 
        {
          res[, ind] <- mat.sampled
        } else 
        {
          res[, ind] <- rowMeans(mat.sampled)
        }
      }
    }
  }
  return(res)
}



#' \code{downsizedRichnessL2T} 
#' @title downsizedRichnessL2T
#' @export
#' @description This procedure takes a list that is the result of the downsizeGC.all method and transforms 
#' it in a matrix of meaned downsied values. Each element of this list contains downsizing results for a given 
#' sample. This result is a matrix in lines the number of simulations and in columns the different downsizing levels
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param richness.list : the result of the downsizeGC.all method
#' @return A matrix with the samples in rows and the downsizing in columns
downsizedRichnessL2T <- function(richness.list)
{
  tmp <- lapply(richness.list, colMeans)
  richness.table <- c()
  for(i in 1:length(tmp)){richness.table <- cbind(richness.table ,tmp[[i]])}
  colnames(richness.table) <- names(tmp); rm(tmp)
  return(t(richness.table))
}


#' \code{computeUpsizedGC} 
#' @title computeUpsizedGC
#' @export
#' @description This procedure takes a table of meaned downsized gene counts where at least one column 
#' is donwsized at a common minimal level. It uses this information to fit distributions of correlations 
#' between different downsized levels and "predict" values for the samples that have not the needed 
#' sequencing depth. The fitting of the models is based on the n-1 to be closer to reality and avoid accumulating errors.
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param richness.table : matrix with samples in rows and downsizings in the columns as produced by downsizedRichnessL2T
#' @param side : by default is 2 (downsizings in the columns)
#' @param keep.real : by default is TRUE. Substitue the predicted values by the real values when is not NA
#' @param plot : default FALSE, plots the regressions
#' @return  matrix with the same dimensions as richness.table but with complete values.
#' @seealso \code{\link{downsizedRichnessL2T}} and \code{\link{downsizeGC.all}}
computeUpsizedGC <- function(richness.table, side = 2, keep.real = TRUE, plot=FALSE)
{
  # regressions to be performed columnwise
  if(side!=2) richness.table <- t(richness.table)
  
  # by default we take the column having all of the points and we upsize upwords
  
  # compute pairwise regressions ans save the parameters
  richness.table <- richness.table[,order(colSums(apply(richness.table, 2, is.na)))] # order using the number of NAs
  coef <- c()
  for(i in 2:ncol(richness.table))
  {
    tmp <- lm(richness.table[,i] ~ richness.table[,i-1])
    if(plot) {plot(richness.table[,i] ~ richness.table[,i-1], main=colnames(richness.table)[i]); abline(tmp,col="red")}
    coef <- rbind(coef,tmp$coefficients)    
  }
  rownames(coef) <- colnames(richness.table)[-1]
  colnames(coef) <- c("intercept","slope")
  
  # compute the upsized scores
  res <- matrix(NA, nrow=nrow(richness.table), ncol=ncol(richness.table)); colnames(res) <- colnames(richness.table); rownames(res) <- rownames(richness.table)
  res[,1] <- round(richness.table[,1],0)
  for(i in 2:ncol(richness.table))
  {
    # There are two ways of doing this either by basing at the closest more accurate or to the first we have data
    # we chose the second option richness.table[,i]
    res[,i] <- round(res[,i-1] * coef[i-1,"slope"] + coef[i-1,"intercept"],0) # extrapolate
    
    if(keep.real)
    { # Subsitute the predicted values by the real ones
      res[,i][!is.na(richness.table[,i])] <- round(richness.table[,i][!is.na(richness.table[,i])],0) # keep the real data
    }
    if(plot) {plot(richness.table[,i] ~ res[,i]); abline(lm(richness.table[,i] ~ res[,i]),col="red")} # plot for checking
  }
  colnames(res) <- paste(colnames(res),"una",sep="_") # upsized NA
  return(res)
}

#' End of section and file
eprifti/momr documentation built on Sept. 27, 2022, 3:36 a.m.