R/umi_correction.R

Defines functions BuildCountMatrixFromReads PlotCorrectionSize SampleNoReps EditDistanceDistribution GetExprDf GetPlotExrDf PlotTrimmedCorrections TrimAndCorrect TrimUmisSummary MaxCharFreq MaxFreqDistribution SampleNumOfAdjacentUmis SampleNumbersOfAdjacentUmis

#' @export
BuildCountMatrixFromReads <- function(filt.rpus, reads.per.umi.per.cb.info, collisions.info=NULL) {
  filt.umis.per.gene <- sapply(filt.rpus, length)
  if (!is.null(collisions.info)) {
    filt.umis.per.gene <- collisions.info[filt.umis.per.gene]
  }
  reads.per.umi.per.cb.info$umis_per_gene <- filt.umis.per.gene
  return(dropestr::BuildCountMatrix(reads.per.umi.per.cb.info))
}

#' @export
PlotCorrectionSize <- function(cms, correction.colors, xlim=NULL, ylim=NULL, min.umi.per.gene=10,
                               facet=T, mapping=NULL, ...) {
  if (is.null(mapping)) {
    mapping <- ggplot2::aes(x=`no correction`, y=`no correction`-value, color=Correction)
  }
  plot.df <- lapply(cms, function(cm) cm@x) %>% tibble::as_tibble() %>%
    dplyr::filter(`no correction` > min.umi.per.gene) %>%
    reshape2::melt(id.vars='no correction', variable.name='Correction')

  gg <- ggplot2::ggplot(plot.df) +
    ggrastr::geom_point_rast(mapping, size = 0.2, alpha=0.05, ...) +
    ggplot2::geom_abline(aes(intercept=0, slope=1)) +
    ggplot2::scale_x_log10(limits=xlim, expand=c(0, 0)) +
    ggplot2::scale_y_log10(limits=ylim, expand=c(0, 0)) +
    ggplot2::annotation_logticks(sides='b') +
    ggplot2::scale_color_manual(values=correction.colors) +
    ggrastr::theme_pdf(legend.pos = c(1, 0)) +
    ggplot2::theme(strip.text.x=ggplot2::element_blank(), panel.spacing=ggplot2::unit(3, 'pt')) +
    ggplot2::guides(color=ggplot2::guide_legend(override.aes=list(alpha=1.0, size=1.5), title='Correction'))

  if (facet) {
    gg <- gg + ggplot2::facet_wrap(~Correction)
  }

  return(gg)
}

#' @export
SampleNoReps <- function(size, ids, probs) {
  umis <- unique(sample(ids, size=size, prob=probs, replace=T))
  while (length(umis) < size) {
    umis <- unique(c(umis, sample(ids, size=size, prob=probs, replace=T)))
  }

  return(umis[1:size])
}

#' @export
EditDistanceDistribution <- function(umis.per.gene, mc.cores=1) {
  umi.length <- nchar(umis.per.gene[[1]][1])
  probs <- parallel::mclapply(umis.per.gene, PairwiseHamming, mc.cores=mc.cores) %>%
    unlist() %>% dropestr::ValueCounts(return_probs=T)
  probs <- probs[paste(1:umi.length)]
  probs[is.na(probs)] <- 0
  names(probs) <- paste(1:umi.length)
  return(probs)
}

# UMI trimming
GetExprDf <- function(filt.umis.per.gene.vec, real.umis.per.gene.vec, raw.umis.per.gene.vec) {
  expressions <- as.data.frame(filt.umis.per.gene.vec)
  expressions$RealValue <- real.umis.per.gene.vec
  expressions$NoCorrection <- raw.umis.per.gene.vec
  return(expressions %>% dplyr::filter(NoCorrection > 1))
}

GetPlotExrDf <- function(trimmed.cur, raw, umi.length, return.all=FALSE, adjusted.raw=FALSE) {
  raw.name <- ifelse(adjusted.raw, 'umis.per.gene.adj', 'umis.per.gene')
  expr.df <- GetExprDf(trimmed.cur$filt_cells, raw$filt_umis_per_gene_simple, trimmed.cur[[raw.name]]) %>%
    reshape2::melt(id.vars=c('RealValue'), variable.name='Correction') %>% dplyr::mutate(UmiLen=umi.length)

  if (return.all)
    return(expr.df)

  return(expr.df %>% dplyr::group_by(Correction, RealValue) %>%
           dplyr::summarise(Max=quantile(value, 0.95), Min=quantile(value, 0.05), TMean=mean(value, trim=0,2)))
}

#' @export
PlotTrimmedCorrections <- function(trimmed.data, raw.data, trimed.length, log=T, adjusted.raw=FALSE, raster=T,
                                   rast.width=NULL, rast.height=NULL, rast.dpi=300, heights.ratio=c(1, 1)) {
  if (raster) {
    geom_point_w <- function(...) ggrastr::geom_point_rast(..., width=rast.width, height=rast.height * heights.ratio[1], dpi=rast.dpi)
  } else {
    geom_point_w <- ggplot2::geom_point
  }
  heights.ratio <- heights.ratio / sum(heights.ratio)

  plot.df <- GetPlotExrDf(trimmed.data, raw.data, trimed.length, adjusted.raw=adjusted.raw)
  plot.df.all <- GetPlotExrDf(trimmed.data, raw.data, trimed.length, return.all=T, adjusted.raw=adjusted.raw)

  plot.labs <- ggplot2::labs(x='Corrected #UMIs without trimming', y='Error on trimmed data, %',
                             title=paste0('Length of trimmed UMI: ', trimed.length))

  plot.df.subset <- plot.df.all %>% dplyr::filter(RealValue < 50) %>%
    dplyr::mutate(RealValueRound=ceiling(RealValue / 5) * 5)
  title.theme <- ggplot2::theme(plot.title=ggplot2::element_text(margin=ggplot2::margin(0, 0, 0.03, 0, 'in')))

  labels <- c('Bayesian', 'cluster', 'cluster-neq', 'directional', 'no correction')
  color.scale <- ggplot2::scale_color_manual(values=c("#017A5A", "#9B3BB8", "#E69F00", "#BD5500", '#757575'), labels=labels)
  # plot.df.subset$RealValueRound[plot.df.subset$RealValue == 1] <- 1

  trans <- if(log) 'log10' else 'identity'
  gg.large <- ggplot2::ggplot(plot.df,
                              ggplot2::aes(x=RealValue,
                                           y=100 * (TMean - RealValue) / RealValue,
                                           color=Correction, linetype=Correction)) +
    geom_point_w(size=0.3, alpha=0.3) +
    ggplot2::geom_smooth(alpha=0.1) +
    ggplot2::scale_linetype_manual(values=c(rep('solid', 4), 'dashed'), guide=F) +
    color.scale +
    ggplot2::scale_x_continuous(expand=c(0, 0), limits=c(1, 7999), trans=trans) +
    ggplot2::scale_y_continuous(expand=c(0, 0), limits=c(-101, 50)) +
    ggplot2::theme(legend.position=c(0, 0), legend.justification=c(0, 0),
                   strip.background=ggplot2::element_blank(),
                   strip.text=ggplot2::element_text(size=14)) +
    title.theme +
    ggplot2::guides(color=ggplot2::guide_legend(ncol=3)) +
    plot.labs

  gg.small <- ggplot2::ggplot(plot.df.subset,
                              ggplot2::aes(x=as.factor(RealValueRound), y=100 * (value - RealValue) / RealValue, color=Correction)) +
    ggrastr::geom_boxplot_jitter(outlier.jitter.width=NULL, outlier.jitter.height=1, outlier.size=0.3,
                                 outlier.alpha=0.7, raster=raster, raster.width=rast.width,
                                 raster.height=rast.height * heights.ratio[2], raster.dpi=rast.dpi) +
    color.scale +
    ggplot2::ylim(-50, 50) +
    ggplot2::theme(legend.position="none") + title.theme +
    plot.labs

  return(list(small=gg.small, large=gg.large))
}

#' @export
TrimAndCorrect <- function(reads.per.umi.per.cb.info, umi.trim.length, mc.cores.large, mc.cores.small,
                           verbosity.level=0, prepare.only=FALSE) {
  reads.per.umi.per.cb <- reads.per.umi.per.cb.info$reads_per_umi

  if (length(reads.per.umi.per.cb) == 0)
    stop("Empty input")

  if (length(reads.per.umi.per.cb[[1]][[1]][[2]]) == 0)
    stop("Information about quality is required for UMI correction")

  trimmed <- list()
  trimmed$reads.per.umi.per.cb <- lapply(reads.per.umi.per.cb, dropestr::TrimUmis, umi.trim.length)
  trimmed$umis.per.gene <- sapply(trimmed$reads.per.umi.per.cb, length)

  trimmed.reads.per.umi.per.cb <- reads.per.umi.per.cb.info
  trimmed.reads.per.umi.per.cb$reads_per_umi <- trimmed$reads.per.umi.per.cb

  umi.distribution <- dropestr::GetUmisDistribution(trimmed$reads.per.umi.per.cb, umi.trim.length)
  trimmed$umi.probabilities <- umi.distribution / sum(umi.distribution)

  max.umi.per.gene <- max(trimmed$umis.per.gene)

  trimmed$collisions.info <- dropestr::FillCollisionsAdjustmentInfo(trimmed$umi.probabilities, max.umi.per.gene)
  max.umi.per.gene.adj <- trimmed$collisions.info[max.umi.per.gene]

  trimmed$correction.info <- dropestr::PrepareUmiCorrectionInfo(trimmed$umi.probabilities, max.umi.per.gene.adj,
                                                                verbosity.level=if (verbosity.level > 1) verbosity.level else 0)

  if (prepare.only)
    return(trimmed)

  filt_cells <- list()
  filt_cells$Bayesian <- trimmed.reads.per.umi.per.cb %>%
    dropestr::CorrectUmiSequenceErrors(umi.probabilities=trimmed$umi.probabilities, collisions.info=trimmed$collisions.info,
                                       correction.info=trimmed$correction.info, mc.cores=mc.cores.large, return='umis',
                                       verbosity.level=verbosity.level)

  filt_cells$cluster <- trimmed.reads.per.umi.per.cb %>%
    dropestr::CorrectUmiSequenceErrors(umi.probabilities=trimmed$umi.probabilities, collisions.info=trimmed$collisions.info,
                                       correction.info=trimmed$correction.info, mc.cores=mc.cores.small,
                                       verbosity.level=verbosity.level, return='umis', mult=1, method='Classic')

  filt_cells$`cluster-neq` <- trimmed.reads.per.umi.per.cb %>%
    dropestr::CorrectUmiSequenceErrors(umi.probabilities=trimmed$umi.probabilities, collisions.info=trimmed$collisions.info,
                                       correction.info=trimmed$correction.info, mc.cores=mc.cores.small,
                                       verbosity.level=verbosity.level, return='umis', mult=1 + 1e-5, method='Classic')

  filt_cells$directional <- trimmed.reads.per.umi.per.cb %>%
    dropestr::CorrectUmiSequenceErrors(umi.probabilities=trimmed$umi.probabilities, collisions.info=trimmed$collisions.info,
                                       correction.info=trimmed$correction.info, mc.cores=mc.cores.small,
                                       verbosity.level=verbosity.level, return='umis', mult=2, method='Classic')

  trimmed$filt_cells <- filt_cells
  return(trimmed)
}

## Trimmed collisions

#' @export
TrimUmisSummary <- function(reads.per.umi.per.cell, trimmed.umi.length, reverse=FALSE) {
  trimmed <- list()
  trimmed$reads.per.umi.per.cb <- lapply(reads.per.umi.per.cell$reads_per_umi, dropestr::TrimUmis,
                                         trimmed.umi.length, reverse=reverse)
  trimmed$umis.per.gene <- sapply(trimmed$reads.per.umi.per.cb, length)

  umi.distribution <- dropestr::GetUmisDistribution(trimmed$reads.per.umi.per.cb, trimmed.umi.length)
  trimmed$umi.probabilities <- umi.distribution / sum(umi.distribution)

  max.umi.per.gene <- max(trimmed$umis.per.gene)
  trimmed$collisions.info <- dropestr::FillCollisionsAdjustmentInfo(trimmed$umi.probabilities, max.umi.per.gene)
  trimmed$neighbours.per.umi <- dropestr::FillAdjacentUmisData(trimmed$umi.probabilities, adjacent_only=T, show_progress=0)[names(trimmed$umi.probabilities)]
  return(trimmed)
}


MaxCharFreq <- function(str) {
  return(max(table(strsplit(str, split='')) / nchar(str)))
}

#' @export
MaxFreqDistribution <- function(prefix.length, umi.distr.tail) {
  substrs <- sapply(names(umi.distr.tail),
                    function(n) substr(n, 1, prefix.length))
  max.frecs <- sapply(substrs, MaxCharFreq)
  observed <- sapply(split(umi.distr.tail, max.frecs), sum) / sum(umi.distr.tail)

  uniform <- sapply(1:10000, function(i) max(table(sample(1:4, size=prefix.length, replace = T)) / prefix.length))
  estimated <- table(uniform) / length(uniform)

  all.fracs <- union(names(estimated), names(observed))
  all.observed <- setNames(rep(0, length(all.fracs)), all.fracs)
  all.observed[names(observed)] <- observed
  all.estimated <- setNames(rep(0, length(all.fracs)), all.fracs)
  all.estimated[names(estimated)] <- estimated
  all.fracs <- as.numeric(all.fracs)

  return(data.frame(observed=c(0, as.vector(all.observed), 0),
                    estimated=c(0, as.vector(all.estimated), 0),
                    fracs=c(min(all.fracs) - 1e-5, all.fracs, max(all.fracs) + 1e-5)))
}

SampleNumOfAdjacentUmis <- function(gene.size, umi.probabilities, uniform=FALSE) {
  if (uniform) {
    umis <- SampleNoReps(gene.size, ids=names(umi.probabilities), probs=NULL)
  } else {
    umis <- SampleNoReps(gene.size, ids=names(umi.probabilities), probs=umi.probabilities)
  }

  rpu <- setNames(rep(1, gene.size), umis)
  return(sum(dropestr::GetAdjacentUmisNum(rpu, rpu)$Total))
}

#' @export
SampleNumbersOfAdjacentUmis <- function(gene.size, umi.probabilities, samples.num, uniform=FALSE) {
  return(sapply(1:samples.num, function(i)
    SampleNumOfAdjacentUmis(gene.size, umi.probabilities, uniform=uniform)))
}
VPetukhov/dropEstAnalysis documentation built on Dec. 28, 2019, 8:16 p.m.