R/processing.R

Defines functions PoolColumns ComputePseudoNtr ComputeColumnStatistics ComputeTotalExpression ComputeExpressionPercentage FilterGenes Scale NormalizeBaseline NormalizeTPM NormalizeRPM NormalizeFPKM Normalize ComputeAbsolute ComputeSteadyStateHalfLives ComputeNtrPosteriorUpper ComputeNtrPosteriorLower ComputeNtrCI ComputeNtrPosteriorQuantile comp.rpm comp.fpkm comp.tpm comp.hl

Documented in ComputeAbsolute ComputeColumnStatistics ComputeExpressionPercentage ComputeNtrCI ComputeNtrPosteriorLower ComputeNtrPosteriorQuantile ComputeNtrPosteriorUpper ComputePseudoNtr ComputeSteadyStateHalfLives ComputeTotalExpression FilterGenes Normalize NormalizeBaseline NormalizeFPKM NormalizeRPM NormalizeTPM PoolColumns Scale

comp.hl=function(p,time=1) ifelse(p==0,Inf,log(2)/(-1/time*log(1-p)))

comp.tpm=function(cmat,lengths,subset=NULL) {
  zerolen=lengths==0
  lengths[zerolen]=1
  rpk=cmat/(lengths/1000)
  rpk[zerolen,]=NA
  scale=colSums(if(!is.null(subset)) rpk[subset,] else rpk,na.rm=T)/1E6
  re=t(t(rpk)/scale)
  re
}

comp.fpkm=function(cmat,lengths,subset=NULL) {
  scale=colSums(if(!is.null(subset)) cmat[subset,] else cmat,na.rm=T)/1E6
  rpm=t(t(cmat)/scale)

  zerolen=lengths==0
  lengths[zerolen]=1
  re=rpm/(lengths/1000)
  re[zerolen,]=NA
  re
}
comp.rpm=function(cmat,subset=NULL,factor=1E6) {
  scale=colSums(if(!is.null(subset)) cmat[subset,] else cmat,na.rm=T)/factor
  re=t(t(cmat)/scale)
  re
}

#' Compute NTR quantiles
#'
#' Computes quantiles from the NTR posterior and puts them into a new slot
#'
#' @param data the grandR object
#' @param quantile which quantile to compute
#' @param name the name of the new slot to put quantile values in
#' @param name.lower the name of the new slot to put the lower bound of the CI in
#' @param name.upper the name of the new slot to put the upper bound of the CI in
#' @param CI.size A number between 0 and 1 representing the size of the credible interval
#'
#' @details The NTR posterior distribution can be approximated by a beta distribution.
#'
#' @details ComputeNtrPosteriorQuantile computes any quantile from this Beta approximation
#'
#' @details ComputeNtrPosteriorLower computes the (1-CI.size)/2 quantile
#'
#' @details ComputeNtrPosteriorUpper computes the 1-(1-CI.size)/2 quantile
#'
#' @details ComputeNtrCI computes both of these quantiles.
#'
#' @return a new grandR object containing an additional slot
#' @export
#'
#' @concept preprocess
ComputeNtrPosteriorQuantile=function(data,quantile,name) {
  a=as.matrix(GetTable(data,type="alpha",name.by = "Gene"))
  b=as.matrix(GetTable(data,type="beta",name.by = "Gene"))
  v=qbeta(quantile,a,b)
  AddSlot(data,name,v)
}
#' @rdname ComputeNtrPosteriorQuantile
#' @export
ComputeNtrCI=function(data,CI.size=0.95,name.lower="lower",name.upper="upper") {
  data=ComputeNtrPosteriorLower(data=data,CI.size=CI.size,name=name.lower)
  data=ComputeNtrPosteriorUpper(data=data,CI.size=CI.size,name=name.upper)
  data
}
#' @rdname ComputeNtrPosteriorQuantile
#' @export
ComputeNtrPosteriorLower=function(data,CI.size=0.95,name="lower") ComputeNtrPosteriorQuantile(data=data,quantile=(1-CI.size)/2,name=name)
#' @rdname ComputeNtrPosteriorQuantile
#' @export
ComputeNtrPosteriorUpper=function(data,CI.size=0.95,name="upper") ComputeNtrPosteriorQuantile(data=data,quantile=1-(1-CI.size)/2,name=name)


#' Steady state half-lives for each sample
#'
#' Transforms each NTR to a half-life value (assuming steady state gene expression) and puts them into a new slot or adds an analysis
#'
#' @param data the grandR object
#' @param time either a number indicating the labeling time, or a name of the \link{Coldata} table
#' @param name the name of the new slot/analysis to put half-life values in
#' @param columns which columns (i.e. samples or cells) to return; sets as.analysis to TRUE (see details)
#' @param max.HL all values above this will be set to this
#' @param CI.size A number between 0 and 1 representing the size of the credible interval
#' @param compute.CI it TRUE, credible intervals are computed, this also sets as.analysis to TRUE
#' @param as.analysis if TRUE add the results as analysis and not as data slot
#'
#' @details An NTR value p can be transformed into an RNA half-live using the equation
#' log(2)/(-1/t*log(1-p))
#' This is described in our GRAND-SLAM paper (Juerges et al., Bioinformatics 2018).
#'
#' @details Columns can be given as a logical, integer or character vector representing a selection of the columns (samples or cells).
#' The expression is evaluated in an environment havin the \code{\link{Coldata}}, i.e. you can use names of \code{\link{Coldata}} as variables to
#' conveniently build a logical vector (e.g., columns=Condition=="x").
#'
#' @return a new grandR object with an additional slot or analysis
#' @export
#'
#' @concept snapshot
ComputeSteadyStateHalfLives=function(data,time=Design$dur.4sU,name="HL", columns=NULL, max.HL=48, CI.size=0.95, compute.CI=FALSE, as.analysis=FALSE) {
  if (is.character(time) && length(time)==1) time=Coldata(data,time)

  ntrs=as.matrix(GetTable(data,type="ntr",name.by = "Gene"))

  if (length(time)==1) time=rep(time,ncol(ntrs))
  stopifnot(ncol(ntrs)==length(time))
  time=setNames(time,colnames(ntrs))

  columns=substitute(columns)
  columns=if (is.null(columns)) colnames(data) else eval(columns,Coldata(data),parent.frame())
  columns=Columns(data,columns)

  time=time[columns]

  if (length(columns)!=ncol(data)) as.analysis=TRUE

  if (compute.CI) {
    as.analysis=TRUE
    data=ComputeNtrCI(data,CI.size = CI.size)
    lower=as.matrix(GetTable(data,type="lower",name.by = "Gene"))
    upper=as.matrix(GetTable(data,type="upper",name.by = "Gene"))
    hls = do.call(cbind,lapply(columns,function(i) cbind(
      pmin(comp.hl(p = upper[,i],time = time[i]),max.HL),
      pmin(comp.hl(p = ntrs[,i],time = time[i]),max.HL),
      pmin(comp.hl(p = lower[,i],time = time[i]),max.HL)
    )))
    colnames(hls)=paste0(rep(c("Half-life.lower.","Half-life.MAP.","Half-life.upper."),ncol(ntrs)),rep(columns,each=3))
  }
  else {
    hls = sapply(columns,function(i) comp.hl(p = ntrs[,i],time = time[i]))
    colnames(hls)=columns
  }

  if (as.analysis) {
    AddAnalysis(data,name = name,table = as.data.frame(hls))
  } else {
    AddSlot(data,name,hls)
  }
}


#' Compute absolute expression using ERCC spike ins
#'
#' Compute absolute expression in a grandR object and puts the normalized data into a new slot
#'
#' @param data the grandR object
#' @param dilution the dilution of the spikein transcript in the lysis reaction mix
#' @param volume 	the approximate volume of the lysis chamber (nanoliters)
#' @param slot the slot containing relative expression values
#' @param name the name of the new slot to put absolute expression values in
#'
#' @seealso \link[monocle]{relative2abs}
#' @return a new grandR object with an additional slot
#' @export
#'
#' @concept preprocess
ComputeAbsolute=function(data,dilution=4E4,volume=10,slot="tpm",name="absolute") {
  checkPackages(c("monocle","VGAM"))
  fd=data.frame(gene_short_name=Genes(data))
  rownames(fd)=Genes(data)
  mat=GetTable(data,type=slot)
  ercc=mat[GeneInfo(data,"Type")=='ERCC',]

  cds <- monocle::newCellDataSet(mat,
                        featureData = methods::new ("AnnotatedDataFrame",fd),
                        lowerDetectionLimit = 0.1,
                        expressionFamily = VGAM::tobit(Lower = 0.1))
  rpc_matrix <- monocle::relative2abs(cds, method = "num_genes", ERCC_controls=ercc, ERCC_annotation=monocle::spike_df,dilution=dilution, volume=volume)
  rpc_matrix[mat]=0

  colnames(rpc_matrix)=colnames(data)
  rownames(rpc_matrix)=Genes(data)
  rpc_matrix[,grepl("test",colnames(rpc_matrix))]=NA
  AddSlot(data,name,rpc_matrix)
}



#' Normalization
#'
#' Normalizes data in a grandR object and puts the normalized data into a new slot
#'
#' @param data the grandR object
#' @param genes compute the normalization w.r.t. these genes (see details)
#' @param name the name of the new slot for the normalized data
#' @param slot the name of the slot for the data to normalize
#' @param set.to.default set the new slot as the default slot
#' @param size.factors numeric vector; if not NULL, use these size factors instead of computing size factors
#' @param return.sf return the size factors and not a grandR object
#' @param tlen the transcript lengths (for FPKM and TPM)
#' @param factor the rpm factor (default: 1 (m)illion)
  #'
#' @details Normalize will perform DESeq2 normalization, i.e. it will use \link[DESeq2]{estimateSizeFactorsForMatrix}
#' to estimate size factors, and divide each value by this. If genes are given, size factors will be computed only w.r.t. these genes (but then all genes are normalized).
#'
#' @details NormalizeFPKM will compute fragments per kilobase and million mapped reads. If genes are given, the scaling factor
#' will only be computed w.r.t. these genes (but then all genes are normalized).
#'
#' @details NormalizeRPM will compute reads per million mapped reads. If genes are given, the scaling factor
#' will only be computed w.r.t. these genes (but then all genes are normalized).
#'
#' @details NormalizeTPM will compute transcripts per million mapped reads. If genes are given, the scaling factor
#' will only be computed w.r.t. these genes (but then all genes are normalized).
#'
#' @details Genes can be referred to by their names, symbols, row numbers in the gene table, or a logical vector referring to the gene table rows.
#'
#' @return a new grandR object with a new data slot
#'
#' @seealso \code{\link{NormalizeBaseline}}
#'
#' @export
#'
#' @examples
#' sars <- ReadGRAND(system.file("extdata", "sars.tsv.gz", package = "grandR"),
#'                   design=c("Cell",Design$dur.4sU,Design$Replicate))
#'
#' sars <- Normalize(sars)
#' DefaultSlot(sars)
#' @concept preprocess
Normalize=function(data,genes=Genes(data),name="norm",slot="count",set.to.default=TRUE,size.factors=NULL,return.sf=FALSE) {
  stopifnot(is.grandR(data))
  mat=as.matrix(GetTable(data,type=slot,genes=genes,ntr.na = FALSE,name.by = "Gene"))
  if (is.null(size.factors)) {
    loggeomeans=rowMeans(log(mat))
    size.factors=apply(mat, 2, function(cnts) {
      exp(stats::median((log(cnts) - loggeomeans)[is.finite(loggeomeans) &
                                              cnts > 0]))
    })
  }
  if (return.sf) return(size.factors)

  mat=as.matrix(GetTable(data,type=slot,ntr.na = FALSE,name.by = "Gene"))
  data=AddSlot(data,name,matrix = t(t(mat)/size.factors),set.to.default=set.to.default)
  data
}
#' @rdname Normalize
#' @export
NormalizeFPKM=function(data,genes=Genes(data),name="fpkm",slot="count",set.to.default=TRUE,tlen=GeneInfo(data,"Length")) {
  genes=ToIndex(data,genes)
  stopifnot(is.grandR(data))
  mat=as.matrix(GetTable(data,type=slot,ntr.na = FALSE,name.by = "Gene"))
  data=AddSlot(data,name,comp.fpkm(mat,tlen,subset = genes),set.to.default=set.to.default)
  data
}
#' @rdname Normalize
#' @export
NormalizeRPM=function(data,genes=Genes(data),name="rpm",slot="count",set.to.default=TRUE,factor=1E6) {
  genes=ToIndex(data,genes)
  stopifnot(is.grandR(data))
  mat=as.matrix(GetTable(data,type=slot,ntr.na = FALSE,name.by = "Gene"))
  data=AddSlot(data,name,comp.rpm(mat,subset = genes,factor=factor),set.to.default=set.to.default)
  data
}
#' @rdname Normalize
#' @export
NormalizeTPM=function(data,genes=Genes(data),name="tpm",slot="count",set.to.default=TRUE,tlen=GeneInfo(data,"Length")) {
  genes=ToIndex(data,genes)
  stopifnot(is.grandR(data))
  mat=as.matrix(GetTable(data,type=slot,ntr.na = FALSE,name.by = "Gene"))
  data=AddSlot(data,name,comp.tpm(mat,tlen,subset = genes),set.to.default=set.to.default)
  data
}

#' Normalization to a baseline
#'
#' Normalizes data in a grandR object to a baseline and puts the normalized data into a new slot
#'
#' @param data the grandR object
#' @param baseline matrix defining the corresponding baseline (row) for each column (sample or cell; see details)
#' @param name the name of the new slot for the normalized data
#' @param slot the name of the slot for the data to normalize
#' @param set.to.default set the new slot as the default slot
#' @param LFC.fun either \link[lfc]{NormLFC} or \link[lfc]{PsiLFC} from the lfc package
#' @param ... forwarded to LFC.fun
#'
#' @details Baseline normalization computes the log2 fold change for a column (i.e. sample or cell) to a baseline columns (or several baseline columns). This is by default done using the
#' \code{\link[lfc]{PsiLFC}} function from the lfc package, which, by default, also normalizes log2 fold changes by adding a constant
#' such that the median is zero.
#'
#' @details Baselines are defined by a square logical matrix, defining for each sample or cell of the grandR object, represented by the column of the matrix,
#' which samples or cells are indeed the baseline (represented by the rows). Such matrices can conveniently be obtained by \code{\link{FindReferences}}.
#'
#' @return a new grandR object with an additional slot
#'
#' @seealso \code{\link{Normalize}},\code{\link{FindReferences}}
#' @export
#'
#' @examples
#' sars <- ReadGRAND(system.file("extdata", "sars.tsv.gz", package = "grandR"),
#'                   design=c("Cell",Design$dur.4sU,Design$Replicate))
#' blmat <- FindReferences(sars,reference = duration.4sU==0, group = "Cell")
#' # the Mock.no4sU or SARS.no4sU sample are the baselines for each sample
#' sars <- NormalizeBaseline(sars,baseline=blmat)
#' head(GetTable(sars,type="baseline"))
#' @concept preprocess
NormalizeBaseline=function(data,baseline=FindReferences(data,reference=Condition==levels(Condition)[1]),name="baseline",slot=DefaultSlot(data),set.to.default=FALSE,LFC.fun=lfc::PsiLFC,...) {
  stopifnot(is.grandR(data))
  mat=as.matrix(GetTable(data,type=slot,ntr.na = FALSE,name.by = "Gene"))
  mat=sapply(colnames(baseline),function(n) LFC.fun(mat[,n],rowMeans(mat[,baseline[,n],drop=FALSE]),...))
  data=AddSlot(data,name,mat,set.to.default=set.to.default)
  data
}


#' Scale data
#'
#' Compute values for all genes standardized (i.e. z scores) across samples.
#'
#' @param data a grandR object
#' @param name the new slot name
#' @param slot the slot from where to take values
#' @param set.to.default set the new slot as default slot
#' @param group Perform standardization per group of columns (see details)
#' @param center Perform centering (forwarded to \link[base]{scale})
#' @param scale Perform scaling (forwarded to \link[base]{scale})
#'
#' @details Standardization can be done per group. For this, the group parameter has to be a name of the \code{\link{Coldata}} table,
#' to define groups of columns (i.e. samples or cells).
#'
#' @seealso \link[base]{scale}
#'
#' @return a new grandR object with a new slot
#' @export
#'
#' @concept preprocess
Scale=function(data,name="scaled",slot=DefaultSlot(data),set.to.default=FALSE,group=NULL,center=TRUE,scale=TRUE) {
  stopifnot(is.grandR(data))
  mat=as.matrix(GetTable(data,type=slot,ntr.na = FALSE,name.by = "Gene"))
  if (is.null(group)) {
    mat=t(scale(t(mat),center = center, scale = scale))
  } else {
    gr=Coldata(data)[[group]]
    for (c in unique(gr)) mat[,gr==c]=t(scale(t(mat[,gr==c]),center = center, scale = scale))
  }
  data=AddSlot(data,name,mat,set.to.default=set.to.default)
  data
}



#' Filter genes
#'
#' Return a grandR object with fewer genes than the given grandR object (usually to filter out weakly expressed genes).
#'
#' @param data the grandR object
#' @param mode.slot the mode.slot that is used for filtering (see details)
#' @param minval the minimal value for retaining a gene
#' @param mincol the minimal number of columns (i.e. samples or cells) a gene has to have a value >= minval
#' @param min.cond if not NULL, do not compare values per column, but per condition (see details)
#' @param use if not NULL, defines the genes directly that are supposed to be retained (see details)
#' @param keep if not NULL, defines genes directly, that should be kept even though they do not adhere to the filtering criteria (see details)
#' @param return.genes if TRUE, return the gene names instead of a new grandR object
#'
#' @details By default genes are retained, if they have 100 read counts in at least half of the columns (i.e. samples or cells).
#'
#' @details The \code{use} parameter can be used to define genes to be retained directly. The \code{keep} parameter, in contrast, defines
#' \emph{additional} genes to be retained. For both, genes can be referred to by their names, symbols, row numbers in the gene table,
#' or a logical vector referring to the gene table rows.
#'
#' @details To refer to data slots, the mode.slot syntax can be used: Each name is either a data slot, or one of (new,old,total)
#' followed by a dot followed by a slot. For new or old, the data slot value is multiplied by ntr or 1-ntr. This can be used e.g. to filter by \emph{new counts}.
#'
#' @details if the \code{min.cond} parameter is given, first all columns belonging to the same \code{\link{Condition}} are summed up, and then the usual filtering
#' is performed by conditions instead of by columns.
#'
#' @return either a new grandR object (if return.genes=FALSE), or a vector containing the gene names that would be retained
#'
#' @export
#'
#' @examples
#'
#' sars <- ReadGRAND(system.file("extdata", "sars.tsv.gz", package = "grandR"),
#'                   design=c("Condition",Design$dur.4sU,Design$Replicate))
#'
#' nrow(sars)
#' # This is already filtered and has 1045 genes
#' nrow(FilterGenes(sars,minval=1000))
#' # There are 966 genes with at least 1000 read counts in half of the samples
#' nrow(FilterGenes(sars,minval=10000,min.cond=1))
#' # There are 944 genes with at least 10000 read counts in the Mock or SARS condition
#' nrow(FilterGenes(sars,use=GeneInfo(sars,"Type")!="Cellular"))
#' # These are the 11 viral genes.
#'
#' @concept preprocess
FilterGenes=function(data,mode.slot='count',minval=100,mincol=ncol(data)/2,min.cond=NULL,use=NULL,keep=NULL,return.genes=FALSE) {
  if (!is.null(use) & !is.null(keep)) stop("Do not specify both use and keep!")

  if (length(mode.slot)!=1) stop("One mode.slot must be given!")
  if (!all(check.mode.slot(data,mode.slot))) stop(sprintf("mode.slot %s unknown!",paste(mode.slot[!check.mode.slot(data,mode.slot)],collapse=",")))

  if (is.null(use)) {
    summi = if (!is.null(min.cond)) GetSummarizeMatrix(data,no4sU=TRUE,average=FALSE) else NULL
    mincol=if (!is.null(min.cond)) min.cond else mincol
    t=GetMatrix(data,mode.slot=mode.slot,summarize = summi)
    use=Matrix::rowSums(t>=minval,na.rm=TRUE)>=mincol
    #use=apply(t,1,function(v) sum(v>=minval,na.rm=TRUE)>=mincol)
    if (!is.null(keep)) use = use | (1:length(use)) %in% ToIndex(data,keep)
  }
  use=ToIndex(data,use)

  if (return.genes) return(unname(use))
  return(data.apply(data,function(t) t[use,],fun.gene.info = function(t) t[use,]))
}


#' Expression percentage computation
#'
#' Compute the expression percentage for a particular set of genes.
#'
#' @param data the grandR object
#' @param name the new name by which this is added to the Coldata
#' @param genes define the set of genes to compute the percentage for
#' @param mode.slot which mode.slot to take the values for computing the percentage from
#' @param genes.total define the set of genes defining the total value
#' @param mode.slot.total which mode.slot to take the values for computing the total
#' @param multiply.by.100 if TRUE, compute percentage values, otherwise fractions between 0 and 1
#'
#' @seealso \code{\link{Coldata}}
#'
#' @details The percentages are computed for the given genes with the given mode.slot, w.r.t the mode.slot.total from the genes.total. Thus
#' to compute the percentage of mitochondrial gene expression in total RNA (unnormalized), only set genes=Genes(data,"^MT-",regex=TRUE).
#' To compute the percentage of new RNA among all genes, set mode.slot="new.count" and mode.slot.total="count".
#'
#' @details Genes can be referred to by their names, symbols, row numbers in the gene table, or a logical vector referring to the gene table rows.
#'
#' @details To refer to data slots, the mode.slot syntax can be used: Each name is either a data slot, or one of (new,old,total)
#' followed by a dot followed by a slot. For new or old, the data slot value is multiplied by ntr or 1-ntr. This can be used e.g. to filter by \emph{new counts}.
#'
#' @return a new grandR object having the expression percentage in its Coldata table
#' @export
#'
#' @concept data
ComputeExpressionPercentage=function(data,name,genes=Genes(data),mode.slot=DefaultSlot(data),genes.total=Genes(data),mode.slot.total=mode.slot,multiply.by.100=TRUE) {
  gof=Matrix::colSums(GetMatrix(data,mode.slot=mode.slot,genes = genes))
  total=Matrix::colSums(GetMatrix(data,mode.slot=mode.slot.total,genes = genes.total))
  percentage=gof/total
  if (multiply.by.100) percentage=percentage*100
  Coldata(data,name)=percentage
  data
}

#' Total expression computation
#'
#' Compute the total expression for a particular set of genes.
#'
#' @param data the grandR object
#' @param name the new name by which this is added to the Coldata
#' @param genes define the set of genes to compute the percentage for
#' @param mode.slot which mode.slot to take the values for computing the percentage from
#'
#' @seealso \code{\link{Coldata}}
#'
#' @details Genes can be referred to by their names, symbols, row numbers in the gene table, or a logical vector referring to the gene table rows.
#'
#' @details To refer to data slots, the mode.slot syntax can be used: Each name is either a data slot, or one of (new,old,total)
#' followed by a dot followed by a slot. For new or old, the data slot value is multiplied by ntr or 1-ntr. This can be used e.g. to filter by \emph{new counts}.
#'
#' @return a new grandR object having the total expression in its Coldata table
#' @export
#'
#' @concept data
ComputeTotalExpression=function(data,name,genes=Genes(data),mode.slot=DefaultSlot(data)) {
  Coldata(data,name)=Matrix::colSums(GetMatrix(data,mode.slot=mode.slot,genes = genes))
  data
}


#' Compute statistics for all columns (i.e. samples or cells)
#'
#' @param data a grandR object
#' @param verbose output status messages
#'
#' @return a new grandR object containing additional columns in the \link{Coldata} table:
#' \itemize{
#'   \item{p.conv.X: the T-to-C mismatch frequency in the given ("X") subread category}
#'   \item{percent.new: new overall percentage of new RNA}
#'   \item{total.reads: the total number of reads (or UMIs, if UMIs were sequences)}
#'   \item{total.genes: the total number of genes detected}
#'   \item{percentage per type: the percentage (up to 100!) of the counts of each type in the GeneInfo}
#' }
#'
#' @export
#'
#' @concept data
ComputeColumnStatistics=function(data,verbose=TRUE) {
  if (verbose) cat(sprintf("Obtaining model parameters...\n"))
  if (data$metadata$`GRAND-SLAM version`==3) {
    tab=GetTableQC(data,"model.parameters",stop.if.not.exist = FALSE)
    if (!is.null(tab)) {
      tab=tab[tab$Label==tab$Label[1] & tab$Estimator==tab$Estimator[1],]
      for (sub in unique(tab$Subread)) {
        m=setNames(tab$`Binom p.conv`[tab$Subread==sub],tab$Condition[tab$Subread==sub])
        Coldata(data,paste("p.conv",sub))=m[colnames(data)]
      }
    }
  } else {
    tab=GetTableQC(data,"mismatches",stop.if.not.exist=FALSE)
    if (!is.null(tab)) {
      strand=GetTableQC(data,"strandness",stop.if.not.exist=FALSE)$V1
      if (strand=="Antisense") {
        tab=tab[(tab$Orientation=="First" & tab$Genomic=="A" & tab$Read=="G")|(tab$Orientation=="Second" & tab$Genomic=="T" & tab$Read=="C"),]
      } else if (strand=="Sense") {
        tab=tab[(tab$Orientation=="First" & tab$Genomic=="T" & tab$Read=="C")|(tab$Orientation=="Second" & tab$Genomic=="A" & tab$Read=="G"),]
      } else {
        tab=tab[(tab$Genomic=="T" & tab$Read=="C")|(tab$Genomic=="A" & tab$Read=="G"),]
      }
      tab=tab[tab$Category=="Exonic",]
      tab=plyr::ddply(tab,c("Condition"),function(s) data.frame(Coverage=sum(s$Coverage),Mismatches=sum(s$Mismatches)))
      m=setNames(tab$Mismatches/tab$Coverage,tab$Condition)
      Coldata(data,"raw.conversions")=m[colnames(data)]
    }

    tab=GetTableQC(data,"rates",stop.if.not.exist=FALSE)
    if (!is.null(tab)) {
      m=unlist(tab[tab$Rate=="single_new",-1])
      Coldata(data,"p.conv.single")=m[colnames(data)]
      Coldata(data,"p.conv.single")[Coldata(data,"no4sU")]=NA
      m=unlist(tab[tab$Rate=="double_new",-1])
      if (any(m!=0)) {
        Coldata(data,"p.conv.double")=m[colnames(data)]
        Coldata(data,"p.conv.double")[Coldata(data,"no4sU")]=NA
      }
    }
  }

  if (verbose) cat(sprintf("Compute percentage new...\n"))
  data=ComputeExpressionPercentage(data,"percent.new",mode.slot="new.count",mode.slot.total="count")

  if (verbose) cat(sprintf("Compute total reads...\n"))
  Coldata(data,"total.reads") = Matrix::colSums(GetMatrix(data,mode.slot="count"))
  if (verbose) cat(sprintf("Compute total genes...\n"))
  Coldata(data,"total.genes") = Matrix::colSums(GetMatrix(data,mode.slot="count")>0)

  if (!is.null(GeneInfo(data)$Type)) {
    for (t in levels(GeneInfo(data)$Type)) {
      if (verbose) cat(sprintf("Compute percent for %s...\n",t))
      Coldata(data,paste0("percent.",t)) = Matrix::colSums(GetMatrix(data,mode.slot="count",genes = GeneInfo(data)$Type==t))/Coldata(data,"total.reads")*100
    }
  }

  data
}

#' Compute pseudo NTRs from two count matrices
#'
#' NTRs can be computed from given new and total counts.
#'
#' @param data a grandR object
#' @param new.slot the slot containing new RNA counts
#' @param total.slot the slot containing total RNA counts
#' @param detection.rate the detection rate of T-to-C mismatch reads (see details)
#'
#' @details To correct for some bias, a detection rate (as suggested by Cao et al., Nature Biotech 2020) should be provided. This detection rate
#' defines, how much new RNA is detected on average using the T-to-C mismatch reads.
#'
#' @return a new grandR object
#' @export
#'
#' @concept data
ComputePseudoNtr=function(data,new.slot,total.slot=DefaultSlot(data),detection.rate=1) {
  if (!check.slot(data,new.slot,allow.ntr=FALSE) || !check.slot(data,total.slot,allow.ntr=FALSE)) stop("Slot unknown!")
  n=GetMatrix(data,mode.slot=new.slot,name.by="Gene")
  t=GetMatrix(data,mode.slot=total.slot,name.by="Gene")

  if (is.matrix(t)) {
    ntr=pmin((n*detection.rate)/t,1)
    ntr[is.nan(ntr)]=0
  } else {
    sX=Matrix::summary(n)
    sY=Matrix::summary(t)
    dd=.Call('fastsparsematdiv',sX$i,sX$j,sX$x,sY$i,sY$j,sY$x,detection.rate)

    ntr=Matrix::sparseMatrix(i=sX$i, j=sX$j, x=dd,dimnames=dimnames(t))
  }

  AddSlot(data,"ntr",ntr,set.to.default=FALSE)
}

#' Pool reads across columns
#'
#' Pool read counts, ntrs, and alpha/beta values across columns defined by a pooling matrix
#'
#' @param data grandR object
#' @param pooling a pooling matrix (see details)
#'
#' @details The pooling matrix must have as many rows as there are columns (i.e., samples or cells) in data,
#' and as many columns as you want to have columns in the resulting object. The matrix should consist of 0 and 1,
#' where 1 indicates a column of the original object that should go into a column of the new object. In essence,
#' to obtain the new count matrix, the old count matrix is matrix-multiplied with the pooling matrix.
#'
#' @details The new ntr matrix is computed by componentwise division of the new count and total count matrices derived
#' as just described. alpha and beta are computed using matrix multiplication, i.e. summing up all alpha and beta values
#' of all the columns belonging to a pool.
#'
#' @return a new grandR object
#' @export
#'
#' @concept data
PoolColumns=function(data,pooling=GetSummarizeMatrix(data,average=FALSE,no4sU=TRUE)) {
  dd=list()
  dd$count = GetMatrix(data,mode.slot="count",summarize=pooling,name.by="Gene")
  newcount = GetMatrix(data,mode.slot="new.count",summarize=pooling,name.by="Gene")
  if (is.matrix(dd$count)) {
    dd$ntr=newcount/dd$count
  } else {
    sX=Matrix::summary(newcount)
    sY=Matrix::summary(dd$count)
    dd=.Call('fastsparsematdiv',sX$i,sX$j,sX$x,sY$i,sY$j,sY$x,1.0)
    ntr=Matrix::sparseMatrix(i=sX$i, j=sX$j, x=dd,dimnames=dimnames(dd$count))
  }
  if (all(c("alpha","beta") %in% Slots(data))) {
    dd$alpha=GetMatrix(data,mode.slot="alpha",summarize=pooling,name.by="Gene")
    dd$beta=GetMatrix(data,mode.slot="beta",summarize=pooling,name.by="Gene")
  }

  cd=data.frame(Name=colnames(pooling))
  rownames(cd)=colnames(pooling)
  for (add in setdiff(names(Coldata(data)),"Name")) {
    col=Coldata(data,add)
    col=apply(pooling,2,function(cc) {
      h=unique(col[cc!=0])
      if (length(h)!=1) h=NA
      h
    })
    if (sum(is.na(col))<length(col)) {
      cd[[add]]=col
    }
  }

  re=grandR(data$prefix,gene.info=data$gene.info,slots=dd,coldata=cd,metadata=data$metadata,analyses=data$analyses)
  DefaultSlot(re)="count"
  re
}

Try the grandR package in your browser

Any scripts or data that you put into this service are public.

grandR documentation built on April 4, 2025, 2:27 a.m.