R/addNoise.R

Defines functions is.whole is.whole.matrix addNoiseToCommMatrix addNoise

Documented in addNoise

#' @title Add Noise
#' @description Add noise to time series.
#'
#' @details Noise is either generated by setting a noise parameter to the observed value and sampling a new value from this noise distribution (add=FALSE)
#' or by sampling the selected noise distribution with user-provided parameters and adding the noise value to each time point independently (add=TRUE).
#' When choosing to add Gaussian noise and the resulting value is negative, it will be set to zero.
#' When generating modified values with the negative binomial and add set to false, the resulting value is added to the observed value.
#' It is not recommended to enable add when using the Poisson or negative binomial distribution.
#'
#' @param x matrix with taxa as rows and time points as columns (optional, can be set to NULL if an input.folder is provided)
#' @param input.folder folder from which to read time series (optional, input folder should have subfolders as generated by function generateTS.R)
#' @param output.folder folder to which modified time series are written (optional)
#' @param expIds set of experiment identifiers to process (only needed if input.folder is provided)
#' @param noise.distrib which noise distribution to use (pois = Poisson, the default, nb = negative binomial, norm = Gaussian, multi = multinomial)
#' @param add whether to sample the abundance value from the noise distribution (the default) or whether to add noise, where the noise value is sampled from the noise distribution with user-specified parameters (add is not possible for noise.distrib = multi)
#' @param seqdepths specify a vector of sequencing depths with as many entries as x has columns or a single value if sequencing depth should be constant (only needed for noise.distrib = multi)
#' @param vary.seqdepth if a constant sequencing depth is provided: vary sequencing depth by adding a value randomly selected from the uniform distribution between 0 and seqdepths/2 (only applicable for noise.distrib = multi)
#' @param scaling.factor if unequal to one: scale observed values with the given scaling factor and round (rounding will convert non-count data into counts, recommended for Poisson and negative binomial distribution)
#' @param param1 first parameter for noise distribution (when add is false: standard deviation for Gaussian distribution, success probability for negative binomial, when add is true: mean for Gaussian distribution, lambda for Poisson, size for negative binomial)
#' @param param2 second parameter for noise distribution (only when add is true: standard deviation for Gaussian distribution, success probability for negative binomial)
#' @param scale.noise divide noise value by scale.noise before adding it to the observed value (only when add is true)
#' @return if x is provided without an output.folder, the modified matrix is returned; if input.folder is provided without an output folder, the last processed experiment identifier is returned
#' @examples
#' \dontrun{
#'   N <- 50
#'   metapop <- generateAbundances(N=500, mode=5, probabs=TRUE)
#'   ts <- simHubbell(N=N, M=500,I=1500,d=N, m.vector=metapop, tskip=500, tend=1000)
#'   ts.noisy <- addNoise(ts)
#'  }
#' @export
addNoise<-function(x, input.folder="", output.folder="", expIds=c(), add=FALSE, seqdepths=c(), vary.seqdepth=FALSE, scaling.factor=1, noise.distrib="pois", param1=1, param2=1, scale.noise=1){
  if((is.null(x) && input.folder=="") || (!is.null(x) && input.folder!="")){
    stop("Please either provide the time series matrix or the input folder of the time series.")
  }

  if(!is.null(x) && length(seqdepths)>1){
    if(length(seqdepths) != ncol(x)){
      stop("Please provide either a single sequencing depth value or as many values as there are columns in x.")
    }
  }

  if(noise.distrib=="multi" && length(seqdepths)==0){
    stop("Please provide sequencing depths for noise distribution multinomial (multi).")
  }

  if(noise.distrib=="multi" && add==TRUE){
    warning("Noise distribution multi does not support option add. Add is set to FALSE.")
    add=FALSE
  }

  if(output.folder != ""){
    if(!file.exists(output.folder)){
      dir.create(output.folder)
    }
  }

  if(input.folder != ""){
    if(!file.exists(input.folder)){
      stop(paste("The input folder",input.folder,"does not exist!"))
    }
    input.timeseries.folder=file.path(input.folder,"timeseries")
    if(!file.exists(input.timeseries.folder)){
      stop("The input folder does not have a time series subfolder!")
    }

    lastExpId=expIds[length(expIds)]
    for(expId in expIds){
      print(paste("Processing identifier",expId))

      input.timeseries.name=paste(expId,"timeseries",sep="_")
      input.timeseries.expId.folder=file.path(input.timeseries.folder,input.timeseries.name)
      if(!file.exists(input.timeseries.expId.folder)){
        stop("The input time series folder does not have a subfolder for the input experiment identifier!")
      }

      # read time series file
      ts.name=paste(expId,"timeseries.txt",sep="_")
      input.path.ts=file.path(input.timeseries.expId.folder,ts.name)
      print(paste("Reading time series from:",input.path.ts,sep=" "))
      ts=read.table(file=input.path.ts,sep="\t",header=FALSE)
      ts=as.matrix(ts)

      if(length(seqdepths)>1){
        if(length(seqdepths) != ncol(ts)){
          stop("Please provide either a single sequencing depth value or as many values as there are columns in x.")
        }
      }

      if(!is.whole.matrix(ts) && scaling.factor==1 && (noise.distrib=="pois" || noise.distrib=="nb")){
        warning(paste("Time series for experiment",expId,"contains non-round numbers. Please scale.",sep=" "))
      }

      if(scaling.factor>1){
        ts=round(ts*scaling.factor)
      }

      noisyTS=addNoiseToCommMatrix(ts,noise.distrib = noise.distrib, add=add, seqdepths=seqdepths, vary.seqdepth=vary.seqdepth, param1=param1, param2=param2, scale.noise=scale.noise)

      # save time series
      if(output.folder!=""){
        ts.name=paste(expId,noise.distrib,"timeseries.txt",sep="_")
        ts.path=file.path(output.folder,ts.name)
        write(t(noisyTS),file=ts.path,ncolumns=ncol(noisyTS),sep="\t")
      }else{
        if(expId==lastExpId){
          return(noisyTS)
        }
      }

    } # end loop expIds

  }else{
    if(scaling.factor>1){
      x=round(x*scaling.factor)
    }
    noisyTS=addNoiseToCommMatrix(x, noise.distrib = noise.distrib, add=add, seqdepths=seqdepths, vary.seqdepth=vary.seqdepth, param1=param1, param2=param2, scale.noise=scale.noise)
    if(output.folder==""){
      return(noisyTS)
    }else{
      ts.name=paste(noise.distrib,"timeseries.txt",sep="_")
      ts.path=file.path(output.folder,ts.name)
      write(t(noisyTS),file=ts.path,ncolumns=ncol(noisyTS),sep="\t")
    }
  }

}

# x matrix with taxa as rows and time points as columns
addNoiseToCommMatrix<-function(x, noise.distrib="pois", add=FALSE, seqdepths=c(), vary.seqdepth=FALSE, param1=1, param2=1, scale.noise=1){
  # print value range
  maxVal=max(x)
  meanVal=mean(x)
  sums=apply(x,2,sum)
  print(paste("Maximum value in the matrix:",maxVal))
  print(paste("Mean value in the matrix:",meanVal))
  print(paste("Minimum sample sum:",min(sums)))
  print(paste("Maximum sample sum:",max(sums)))

  # multivariate noise
  if(noise.distrib=="multi"){
    for(j in 1:ncol(x)){
      seqdepth=NA
      if(length(seqdepths)>1){
        seqdepth=seqdepths[j]
      }else{
        seqdepth=seqdepths
        if(vary.seqdepth==TRUE){
          seqdepth=seqdepth+runif(1,min=0, max=seqdepth/2)
        }
      }
      prob=x[,j]/sum(x[,j])
      # for very small abundances, negative probabilities can be introduced and need to be treated
      if(length(which(prob<0))>0){
        #print(paste("Sample",j,"contains negative probability! Negative probability is set to 0."))
        neg.indices=which(prob<0)
        prob[neg.indices]=0
      }
      # generate one sample with the multinomial distribution and with the given target sequencing depth
      x[,j]=rmultinom(1,size=seqdepth,prob=prob)
    }
  # univariate noise
  }else{
    # loop rows
    for(i in 1:nrow(x)){
      for(j in 1:ncol(x)){
        value=NA
        error=NA
        if(add==TRUE){
          if(noise.distrib=="pois"){
            error=(rpois(1,lambda=param1))/scale.noise
          }else if(noise.distrib=="norm"){
            error=(rnorm(1,mean=param1,sd=param2))/scale.noise
          }else if(noise.distrib=="nb"){
            error=rnbinom(1,size=param1,prob=param2)
          }else{
            stop(paste("Noise type",noise.distrib,"not supported."))
          }
          value=x[i,j]+error
        }else{
          if(noise.distrib=="pois"){
            value=rpois(1,lambda=x[i,j])
          }else if(noise.distrib=="norm"){
            value=rnorm(1,mean=x[i,j],sd=param1)
          }else if(noise.distrib=="nb"){
            value=x[i,j]+rbinom(1,size=x[i,j],prob=param1)
          }else{
            stop(paste("Noise type",noise.distrib,"not supported."))
          }
        }
        # prevent negative values
        if(value<0){
          value=0
        }
        x[i,j]=value
      } # column loop
    } # row loop
  } # univariate noise
  return(x)
}

# test whether matrix only contains integers.
is.whole.matrix<-function(x){
  if(length(which(apply(x,2,is.whole)==FALSE))>0){
    return(FALSE)
  }else{
    return(TRUE)
  }
}

# taken from: https://stat.ethz.ch/pipermail/r-help/2003-April/032471.html
is.whole <- function(a, tol = 1e-7) {
  is.eq <- function(x,y) {
    r <- all.equal(x,y, tol=tol)
    is.logical(r) && r
  }
  (is.numeric(a) && is.eq(a, floor(a))) ||
    (is.complex(a) && {ri <- c(Re(a),Im(a)); is.eq(ri, floor(ri))})
}
hallucigenia-sparsa/seqtime documentation built on Jan. 9, 2023, 11:53 p.m.