R/downsizing.R

#' \code{indexReads} 
#' @title indexReads
#' @description for a given column of the raw count matrix this function indexes the count vector 
#'        in a vector of reads of the length of the sum of the counts. It allows after to randomly 
#'        select a subset
#' @author Edi Prifti
#' @param v : an integer raw count vector
#' @param silent : default FALSE, debugging
#' @return an character indexed vector
indexReads <- function(v, silent=FALSE){
  # keep only reads that are not 0 for optimisation purposes
  v.no0 <- v[v!=0]
  # now inflate the vector
  res <- rep(NA,sum(v.no0))
  ind.d <- 1
  ind.f <- 1
  for(i in 1:length(v.no0)){
    if(!silent){if(i%%1000==0){print(i)}}
    ind.f <- ind.d + v.no0[i]-1
    res[ind.d : ind.f] <- paste(names(v.no0)[i],1:v.no0[i],sep="_")
    ind.d <- ind.f+1
  }
  return(res)
}

#' \code{indexReadsGC} 
#' @title indexReadsGC
#' @description for a given column of the raw count matrix this function indexes the count vector 
#'        in a vector of reads of the length of the sum of the counts. It allows after to randomly 
#'        select a subset
#' @author Edi Prifti
#' @param v : an integer raw count vector
#' @param silent : default FALSE, debugging
#' @return an character indexed vector
#' @note this function is optimized when dowsizing for gene count
indexReadsGC <- function(v, silent=TRUE){
  # keep only reads that are not 0
  v.no0 <- v[v!=0]
  # now inflate the vector
  res <- rep(NA,sum(v.no0))
  ind.d <- 1
  ind.f <- 1
  for(i in 1:length(v.no0)){
    if(!silent) if(i%%1000==0){print(i)}
    ind.f <- ind.d + v.no0[i]-1
    res[ind.d : ind.f] <- names(v.no0)[i]
    ind.d <- ind.f+1
  }
  return(res)
}

#' \code{sampleRandomly} 
#' @title sampleRandomly
#' @description This function samples randomly a unique subset of the given indexed vector
#' @author Edi Prifti
#' @param v.ind : a character vector of the indexed reads
#' @param level : default 11e6, the number of reads to be selected randomly. This sould be smaller than the size of v.ind 
#' @return a character indexed vector
sampleRandomly <- function(v.ind, level=11000000){
  res <- sample(x=v.ind, size=level, replace=F)
  return(res)
}


#' \code{countSampledGenes} 
#' @title countSampledGenes
#' @description counts the number of genes that have been sampled by un_indexing
#' @author Edi Prifti
#' @param v.samp : a character vector of the sampled indexed reads (output of sampleRandlomly)
#' @return a table with counts for each gene
countSampledGenes <- function(v.samp){
  res <- gsub("_.*","",v.samp)
  res <- table(res)
  return(res)
}

#' \code{countSampledGenesGC} 
#' @title countSampledGenesGC
#' @description counts the number of genes that have been sampled by un_indexing
#' @author Edi Prifti
#' @param v.samp : a character vector of the sampled indexed reads (output of sampleRandlomly)
#' @return a table with counts for each gene
#' @note an optimized version for the gene count downsizing
countSampledGenesGC <- function(v.samp){
  res <- unique(v.samp)
  return(length(res))
}


#' \code{downsizeGC} 
#' @title downsizeGC
#' @description This function takes a matrix with raw reads counts and computes the number of genes 
#' at a given downsizing level a given number of times.
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : raw read count matrix with gene_ids as rownames
#' @param level : default 11e6, the downsizing level number of reads to be selected randomly. 
#' @param repetitions : default 30, the number of times the drawing is performed. Usually 30 or 10 to speed things out
#' @param silent : default is FALSE prints the status of downsizing
#' @return a matrix containing in rows a vector for each repetition and in columns the number of downsized genes for each sample
#' @note if the downsizing level is higher than the number of reads for a given sample than the result will be NA
downsizeGC <- function(data, level= 11000000, repetitions = 30, silent=FALSE){
  # for all the individuals of the set
  # data is supposed to contain raw reads and rownames to be id_fragment_external
  res <- matrix(NA,nrow=repetitions, ncol=ncol(data)); 
  rownames(res) <- 1:repetitions; colnames(res) <- colnames(data)
  for(ind in 1:ncol(data)){
    if(!silent) print(paste(ind,"Sample",colnames(data)[ind],"with",sum(data[,ind]),"reads and",sum(data[,ind]!=0),"genes"))
    if(level>sum(data[,ind])){
      if(!silent) print("This sample will not be downsized since the number of mapped reads is lower than the level")
    }else {
      v <- data[,ind]; names(v) <- rownames(data)
      v.ind <- indexReadsGC(v)
      for(step in 1:repetitions){
        v.samp <- sampleRandomly(v.ind=v.ind, level=level)
        res[step,ind] <- countSampledGenesGC(v.samp)
        if(!silent) print(paste("        step",step,"with",res[step,ind],"genes"))
      }
    }
  }
  return(res)
}


#' \code{downsizeGC.all} 
#' @title downsizeGC.all
#' @description This function takes a matrix with raw reads counts and computes the number of genes at different 
#' downsizing levels a given number of times. This is similar to the downsizeGC function but for optimization purposes 
#' it downsizes at different thresholds all together
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : raw read count matrix with gene_ids as rownames
#' @param levels : default seq(1E06,11E06,1E06), the downsizing levels number of reads to be selected randomly. 
#' @param repetitions : default 10, the number of times the drawing is performed. Usually 30 or 10 to speed things out
#' @param silent : default is FALSE prints the status of downsizing
#' @return a list of matrixes one per sample containing in rows a vector for each repetition and in columns the number of downsized genes for 
#'        each downsizing level
#' @note if the downsizing level is higher than the number of reads for a given sample than the result will be NA
downsizeGC.all <- function(data, levels= c(seq(1E06,11E06,1E06)), repetitions = 10, silent=FALSE){
  # for all the individuals of the set
  # data is supposed to contain raw reads and rownames to be id_fragment_external
  result <- list()
  for(ind in 1:ncol(data)){
    if(!silent) print(paste(ind,"Sample",colnames(data)[ind],"with",sum(data[,ind]),"reads and",sum(data[,ind]!=0),"genes"))
    res <- matrix(NA, nrow=repetitions, ncol=length(levels)); 
    rownames(res) <- 1:repetitions; colnames(res) <- paste("down_",levels/1E06,"M",sep="")
    for (j in 1:length(levels)){
      if(levels[j]>sum(data[,ind])){
        if(!silent) print(paste("Sample not downsized to ", levels[j]/1E06,"M since the number of mapped reads is lower",sep=""))
      }else {
        v <- data[,ind]; names(v) <- rownames(data)
        v.ind <- indexReadsGC(v)
        for(step in 1:repetitions){
          v.samp <- sampleRandomly(v.ind=v.ind, level=levels[j])
          res[step,j] <- countSampledGenesGC(v.samp)
          if(!silent) print(paste("        downsizing ",levels[j]/1E06,"M, step",step,"with",res[step,j],"genes"))
        }
      }
    }
    result[[ind]] <- res
  }
  names(result) <- colnames(data)
  return(result)
}


#' \code{downsizeMatrix} 
#' @title downsizeMatrix
#' @description takes a matrix with raw read gene counts and converts it to a downsized matrix with identical number of mapped reads 
#'      for each sample (column)
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : raw read count matrix with gene_ids as rownames
#' @param level : default 11E06, the downsizing levels number of reads to be selected randomly. 
#' @param repetitions : default 1, This can be also computed several times, but one is the error minimal downsizing strategy
#' @param silent : default is FALSE prints the status of downsizing
#' @return downsized read gene count matrix corresponding to the mean counts obtained with the selected number of independant 
#'      downsizing procedure
#' @note if the downsizing level is higher than the number of reads for a given sample than the result will be NA
downsizeMatrix <- function(data, level= 11000000, repetitions = 1, silent=FALSE){
  # for all the individuals of the set
  # data is supposed to contain raw reads and rownames to be id_fragment_external
  res <- matrix(NA,nrow=nrow(data),ncol=ncol(data)); rownames(res) <- rownames(data); colnames(res) <- colnames(data)
  for(ind in 1:ncol(data)){
    if(!silent) print(paste(ind,"Sample",colnames(data)[ind],"with",sum(data[,ind]),"reads and",sum(data[,ind]!=0),"genes"))
    if(sum(data[,ind]) < level){
      if(!silent) warning("This sample can't be downsized")
    }else{
      v <- data[,ind]; names(v) <- rownames(data)
      v.ind <- indexReads(v)
      # count reads for each sampled gene for each repetition and save these into a matrix
      mat.sampled <- matrix(0,nrow=nrow(data), ncol=repetitions); colnames(mat.sampled) <- 1:repetitions; rownames(mat.sampled) <- rownames(data)
      for(step in 1:repetitions){
        v.samp <- sampleRandomly(v.ind=v.ind, level=level)
        tmp <- countSampledGenes(v.samp)
        mat.sampled[match(names(tmp),rownames(mat.sampled)),step] <- tmp
        if(!silent) print(paste("        step",step,"with",sum(mat.sampled[,step]!=0),"genes"))
      }
      # compute the mean counts for each gene
      if(repetitions==1){
        res[,ind] <- mat.sampled
      }else{
        res[,ind] <- rowMeans(mat.sampled)
      }
    }
  }
  return(res)
}

#' \code{downsizedRichnessL2T} 
#' @title downsizedRichnessL2T
#' @description This procedure takes a list that is the result of the downsizeGC.all method and transforms 
#' it in a matrix of meaned downsied values. Each element of this list contains downsizing results for a given 
#' sample. This result is a matrix in lines the number of simulations and in columns the different downsizing levels
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param richness.list : the result of the downsizeGC.all method
#' @return A matrix with the samples in rows and the downsizing in columns
downsizedRichnessL2T <- function(richness.list){
  tmp <- lapply(richness.list,colMeans)
  richness.table <- c()
  for(i in 1:length(tmp)){richness.table <- cbind(richness.table ,tmp[[i]])}
  colnames(richness.table) <- names(tmp); rm(tmp)
  return(t(richness.table))
}


#' \code{computeUpsizedGC} 
#' @title computeUpsizedGC
#' @description This procedure takes a table of meaned downsized gene counts where at least one column 
#' is donwsized at a common minimal level. It uses this information to fit distributions of correlations 
#' between different downsized levels and "predict" values for the samples that have not the needed 
#' sequencing depth. The fitting of the models is based on the n-1 to be closer to reality and avoid accumulating errors.
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param richness.table : matrix with samples in rows and downsizings in the columns as produced by downsizedRichnessL2T
#' @param side : by default is 2 (downsizings in the columns)
#' @param keep.real : by default is TRUE. Substitue the predicted values by the real values when is not NA
#' @param plot : default FALSE, plots the regressions
#' @return  matrix with the same dimensions as richness.table but with complete values.
#' @seealso \code{\link{downsizedRichnessL2T}} and \code{\link{downsizeGC.all}}
computeUpsizedGC <- function(richness.table, side = 2, keep.real = TRUE, plot=FALSE){
  # regressions to be performed columnwise
  if(side!=2) richness.table <- t(richness.table)
  
  # by default we take the column having all of the points and we upsize upwords
  
  # compute pairwise regressions ans save the parameters
  richness.table <- richness.table[,order(colSums(apply(richness.table, 2, is.na)))] # order using the number of NAs
  coef <- c()
  for(i in 2:ncol(richness.table)){
    tmp <- lm(richness.table[,i] ~ richness.table[,i-1])
    if(plot) {plot(richness.table[,i] ~ richness.table[,i-1], main=colnames(richness.table)[i]); abline(tmp,col="red")}
    coef <- rbind(coef,tmp$coefficients)    
  }
  rownames(coef) <- colnames(richness.table)[-1]
  colnames(coef) <- c("intercept","slope")
  
  # compute the upsized scores
  res <- matrix(NA, nrow=nrow(richness.table), ncol=ncol(richness.table)); colnames(res) <- colnames(richness.table); rownames(res) <- rownames(richness.table)
  res[,1] <- round(richness.table[,1],0)
  for(i in 2:ncol(richness.table)){
    # There are two ways of doing this either by basing at the closest more accurate or to the first we have data
    # we chose the second option richness.table[,i]
    res[,i] <- round(res[,i-1] * coef[i-1,"slope"] + coef[i-1,"intercept"],0) # extrapolate
    
    if(keep.real){ # Subsitute the predicted values by the real ones
      res[,i][!is.na(richness.table[,i])] <- round(richness.table[,i][!is.na(richness.table[,i])],0) # keep the real data
    }
    if(plot) {plot(richness.table[,i] ~ res[,i]); abline(lm(richness.table[,i] ~ res[,i]),col="red")} # plot for checking
  }
  colnames(res) <- paste(colnames(res),"una",sep="_") # upsized NA
  return(res)
}

#' End of section and file

Try the momr package in your browser

Any scripts or data that you put into this service are public.

momr documentation built on May 2, 2019, 4:21 a.m.