R/qc.R

Defines functions remove_outliers plot_UMI_dup plot_demultiplex plot_mapping plot_QC_pairs calculate_QC_metrics detect_outlier .qq_outliers_robust

Documented in calculate_QC_metrics detect_outlier plot_demultiplex plot_mapping plot_QC_pairs plot_UMI_dup remove_outliers

#' Detect outliers based on robust linear regression of QQ plot
#' @param x a vector of mahalanobis distance
#' @param df degree of freedom for chi-square distribution
#' @param conf confidence for linear regression
#'
#' @return cell names of outliers
.qq_outliers_robust <- function(x, df, conf) {
  n <- length(x)
  P <- stats::ppoints(n)
  z <- stats::qchisq(P, df=df)
  ord.x <- x[order(x)]
  coef <- stats::coef(MASS::rlm(ord.x ~ z, maxit=2000))
  a <- coef[1]
  b <- coef[2]
  zz <- stats::qnorm(1 - (1 - conf)/2)
  SE <- (b/stats::dchisq(z, df))*sqrt(P*(1 - P)/n)
  fit.value <- a + b*z
  upper <- fit.value + zz*SE
  thr <- max(ord.x[ord.x < upper])
  return(names(x[x > thr]))
}


#' Detect outliers based on QC metrics
#'
#' @description This algorithm will try to find \code{comp} number of components
#'   in quality control metrics using a Gaussian mixture model. Outlier
#'   detection is performed on the component with the most genes detected. The
#'   rest of the components will be considered poor quality cells. More cells
#'   will be classified low quality as you increase \code{comp}.
#
#' @param sce a \code{SingleCellExperiment} object containing QC metrics.
#' @param comp the number of component used in GMM. Depending on the quality of
#'  the experiment.
#' @param sel_col a vector of column names which indicate the columns to use
#'  for QC.
#' By default it will be the statistics generated by `calculate_QC_metrics()`
#' @param type only looking at low quality cells (`low`) or possible
#' doublets (`high`) or both (`both`)
#' @param conf confidence interval for linear regression at lower and
#' upper tails.Usually, this is smaller for lower tail because we hope
#' to pick out more low quality cells than doublets.
#' @param batch whether to perform quality control separately for each batch.
#' Default is FALSE. If set to TRUE then you should have a column called
#' `batch` in the `colData(sce)`.
#'
#' @details detect outlier using Mahalanobis distances
#'
#' @return an updated \code{SingleCellExperiment} object with an `outlier`
#' column in \code{colData}
#'
#' @importFrom stats cov pchisq mahalanobis complete.cases qchisq
#' @importFrom robustbase covMcd
#' @importFrom mclust Mclust mclustBIC
#' @importFrom glue glue glue_collapse
#'
#' @export
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#' # the sample qc data already run through function `calculate_QC_metrics`
#' # for a new sce please run `calculate_QC_metrics` before `detect_outlier`
#' sce = detect_outlier(sce)
#' table(QC_metrics(sce)$outliers)
#'
detect_outlier <- function(sce,
                          comp=1,
                          sel_col=NULL,
                          type=c("low", "both", "high"),
                          conf=c(0.9, 0.99),
                          batch=FALSE) {
  type <- match.arg(type)
  sce <- validObject(sce)
  # check format:
  if (is.null(sel_col)) {
    sel_col <- c("number_of_genes", "total_count_per_cell",
                 "non_mt_percent", "non_ERCC_percent")
  }
  if (!all(sel_col %in% colnames(QC_metrics(sce)))) {
    missing <- sel_col[!(sel_col %in% colnames(QC_metrics(sce)))]

    missing_vars <- glue_collapse(glue("'{missing}'"), sep = ",", last = " and ")
    message(glue("the following QC metrics not found in colData from sce: {missing_vars}"))

    if (any(c("number_of_genes", "total_count_per_cell") %in% missing_vars)) {
      stop("the quality control metrics should at least contain the number of
           genes(`number_of_genes`) and counts per cell(`total_count_per_cell`)!")
    }
  }
  qc_cols <- colnames(QC_metrics(sce)) %in% sel_col
  x_all <- as.data.frame(QC_metrics(sce)[, qc_cols])
  x_all$total_count_per_cell <- log2(x_all$total_count_per_cell + 1)

  if (!all(complete.cases(x_all))) {
    stop("NAs found in the selected columns, check the quality control matrix")
  }
  if (is.null(dim(x_all))) {
    QC_metrics(sce)$outliers <- FALSE
    return(sce)
  }
  if (!batch) {
    batch_info <- rep(1, nrow(x_all))
  } else {
    if ("batch" %in% colnames(colData(sce))) {
      batch_info <- colData(sce)$batch
    } else {
      stop("did not find column `batch` in colData(sce).")
    }
  }

  if(any(rowSums(x_all) == 0)){
    stop("Some cells have zero counts. remove these cells before detecting outlier.")
  }else if(any(colSums(x_all) == 0)){
    stop("Some QC metrics are all zeros, check your quality control matrix by QC_metrics(sce).")
  }
  outliers <- rep(FALSE, nrow(x_all))

  for (a_batch in unique(batch_info)) {
    x <- x_all[batch_info == a_batch, ]
    dist <- mahalanobis(x, center=colMeans(x), cov=cov(x))
    keep <- !(dist > qchisq(0.99, ncol(x)))
    mod <- Mclust(x[keep, ],
                  G=comp,
                  modelNames=c("EEE","EVV", "VVV"),
                  verbose=FALSE)

    if (comp == 1) {
      covr <- covMcd(x, alpha=0.7)
      dist <- mahalanobis(x,
                          center=covr$center,
                          cov=covr$cov)
      mean_diff <- sign(t(x) - covr$center)
      QC_sign <- c(-1, 1)[as.factor(apply(mean_diff, 2, function(t) {sum(t) > 0}))]
      neg_dist <- dist[QC_sign == -1]
      pos_dist <- dist[QC_sign == 1]
      if (type == "both") {
        outlier_cells <- .qq_outliers_robust(neg_dist, ncol(x), conf[1])
        outlier_cells <- c(outlier_cells,
                           .qq_outliers_robust(pos_dist, ncol(x), conf[2]))
      } else if (type == "low") {
        outlier_cells <- .qq_outliers_robust(neg_dist, ncol(x), conf[1])
      } else if (type == "high") {
        outlier_cells <- .qq_outliers_robust(pos_dist, ncol(x), conf[2])
      }
    } else {
      ord_fst <- c(1:comp)[order(mod$parameters$mean[1, ], decreasing = TRUE)]
      poor_comp <- ord_fst[2:comp]
      good_comp <- ord_fst[1]
      keep1 <- rep(TRUE, nrow(x))
      keep1[keep][mod$classification %in% poor_comp] <- FALSE
      keep1[!keep] <- FALSE
      sub_x <- x[keep1, ]
      covr <- covMcd(sub_x, alpha=0.7)
      sub_dist <- mahalanobis(sub_x,
                              center=covr$center,
                              cov=covr$cov)

      mean_diff <- sign(t(sub_x) - covr$center)
      QC_sign <- c(-1, 1)[as.factor(apply(mean_diff, 2, function(t) {sum(t) > 0}))]
      neg_dist <- sub_dist[QC_sign == -1]
      pos_dist <- sub_dist[QC_sign == 1]

      outlier_cells <- .qq_outliers_robust(neg_dist, ncol(sub_x), conf[1])
      outlier_cells <- c(outlier_cells,
                         .qq_outliers_robust(pos_dist, ncol(sub_x), conf[2]))
      outlier_cells <- c(outlier_cells, rownames(x[!keep1, ]))

      if (!(type == "both")) {
        mean_diff <- sign(t(x)-mod$parameters$mean[, good_comp])
        QC_sign <- c(-1, 1)[as.factor(apply(mean_diff, 2, function(t) {sum(t) > 0}))]
        is_outlier_cell <- rownames(x) %in% outlier_cells
        if (type == "low") {
          outlier_cells <- rownames(x)[is_outlier_cell & (QC_sign == -1)]
        } else if (type == "high") {
          outlier_cells <- rownames(x)[is_outlier_cell & (QC_sign == 1)]
        }
      }
    }

    if (any(rownames(x) %in% outlier_cells)) {
      outliers[batch_info == a_batch][rownames(x)  %in% outlier_cells] = TRUE
    }
  }
  QC_metrics(sce)$outliers <- as.factor(outliers)
  return(sce)
}


#' Calcuate QC metrics from gene count matrix
#'
#' @param sce a \code{SingleCellExperiment} object containing gene counts
#' @details get QC metrics using gene count matrix.
#' The QC statistics added are \itemize{
#'  \item{number_of_genes} number of genes detected.
#'  \item{total_count_per_cell} sum of read number after UMI deduplication.
#'  \item{non_mt_percent} 1 - percentage of mitochondrial gene counts.
#'  Mitochondrial genes are retrived by GO term GO:0005739
#'  \item{non_ERCC_percent} ratio of exon counts to ERCC counts
#'  \item{non_ribo_percent} 1 - percentage of ribosomal gene counts
#' ribosomal genes are retrived by GO term GO:0005840.
#' }
#' @return an \code{SingleCellExperiment} with updated QC metrics
#'
#' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames rowData rowData<- colData colData<-
#'
#' @export
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#'
#' # The sample qc data already run through function `calculate_QC_metrics`.
#' # So we delete these columns and run `calculate_QC_metrics` to get them again:
#' colnames(colnames(QC_metrics(sce)))
#' QC_metrics(sce) = QC_metrics(sce)[,c("unaligned","aligned_unmapped","mapped_to_exon")]
#' sce = calculate_QC_metrics(sce)
#' colnames(QC_metrics(sce))
#'
calculate_QC_metrics <- function(sce) {
  sce = validObject(sce) # check the sce object
  if (!("counts" %in% names(assays(sce)))){
    stop("counts not in names(assays(sce)). Cannot find count data.")
  }

  # get gene number
  gene_number <- colSums(assay(sce,"counts")>0)
  if (all(gene_number == 0)) {
    stop("all genes have zero count. Check the expression matrix.")
  }else{
    if (!("scPipe" %in% names(sce@metadata))){
      sce@metadata[["scPipe"]] = list(QC_cols=c("number_of_genes"))
    }else if (!("QC_cols" %in% names(sce@metadata$scPipe))){
      sce@metadata[["scPipe"]] = list(QC_cols=c("number_of_genes"))
      QC_metrics(sce) = DataFrame(row.names = colnames(sce))
      # create a empty QC metrics if not exists
    }
    QC_metrics(sce)$number_of_genes = gene_number
  }

  # get total count per cell
  QC_metrics(sce)$total_count_per_cell = colSums(assay(sce,"counts"))

  # get ERCC ratio
  if(any(grepl("^ERCC-", rownames(sce)))){
    exon_count = colSums(assay(sce,"counts")[!grepl("^ERCC-", rownames(sce)),])
    ERCC_count = colSums(assay(sce,"counts")[grepl("^ERCC-", rownames(sce)),])
    QC_metrics(sce)$non_ERCC_percent = exon_count/(ERCC_count+exon_count+1e-5)
  }else{
      message("no ERCC spike-in. Skip `non_ERCC_percent`")
    }

  # get mt percentage
  if (!is.na(gene_id_type(sce))) {
    mt_genes <- get_genes_by_GO(returns=gene_id_type(sce),
                                dataset=organism(sce),
                                go=c("GO:0005739"))
    if (length(mt_genes)>0) {
      if (sum(rownames(sce) %in% mt_genes)>1) {
        mt_count <- colSums(assay(sce,"counts")[rownames(sce) %in% mt_genes, ])
        QC_metrics(sce)$non_mt_percent <- (colSums(assay(sce,"counts"))-
                                             mt_count)/(colSums(assay(sce,"counts"))+1e-5)
        # add 1e-5 to make sure they are not NA
      }
    }
  }
  else {
    message("no gene_id_type, skip `non_mt_percent`")
  }

  # get ribosomal percentage
  if (!is.na(gene_id_type(sce))) {
    ribo_genes <- get_genes_by_GO(returns=gene_id_type(sce),
                               dataset=organism(sce),
                               go=c("GO:0005840"))
    if (length(ribo_genes)>0) {
      if (sum(rownames(sce) %in% ribo_genes)>1) {
        ribo_count <- colSums(assay(sce,"counts")[rownames(sce) %in% ribo_genes, ])
        QC_metrics(sce)$non_ribo_percent=(colSums(assay(sce,"counts"))-
                                            ribo_count)/(colSums(assay(sce,"counts"))+0.01)
      }
    }
  }
  else {
    message("no gene_id_type, skip `non_ribo_percent`")
  }
  return(sce)
}


#' Plot GGAlly pairs plot of QC statistics from \code{SingleCellExperiment}
#' object
#'
#' @param sce a \code{SingleCellExperiment} object
#' @param sel_col a vector of column names which indicate the columns to use for
#'   plot. By default it will be the statistics generated by
#'   `calculate_QC_metrics()`
#' @importFrom GGally ggpairs wrap
#' @import ggplot2
#' @export
#' @return a ggplot2 object
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#' sce = detect_outlier(sce)
#'
#' plot_QC_pairs(sce)
#'
plot_QC_pairs = function(sce, sel_col=NULL) {
  sce = validObject(sce) # check the sce object
  if (is.null(sel_col)) {
    sel_col = c("number_of_genes", "total_count_per_cell", "non_mt_percent",
                  "non_ERCC_percent", "non_ribo_percent", "outliers")
  }
  if (!any(sel_col %in% colnames(QC_metrics(sce)))){
    stop("`sel_col` not in colnames(QC_metrics(sce))).")
  }
  x = as.data.frame(QC_metrics(sce)[, colnames(QC_metrics(sce)) %in% sel_col])


  if ("outliers" %in% colnames(x)) {
    plot_output <- ggpairs(
      x,
      mapping = ggplot2::aes_string(colour = "outliers"),
      upper = list(continuous = wrap("cor", size=3, hjust=0.8))
    )
  }
  else {
    plot_output <- ggpairs(x)
  }

  return(plot_output)
}

#' Plot mapping statistics for \code{SingleCellExperiment} object.
#'
#' @param sce a \code{SingleCellExperiment} object
#' @param sel_col a vector of column names, indicating the columns to use for
#'   plot. by default it will be the mapping result.
#' @param percentage TRUE to convert the number of reads to percentage
#' @param dataname the name of this dataset, used as plot title
#'
#' @import scales ggplot2
#' @importFrom reshape melt
#' @importFrom stats prcomp reorder
#' @return a ggplot2 object
#' @export
#'
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#'
#' plot_mapping(sce,percentage=TRUE,dataname="sc_sample")
#'
plot_mapping <- function(sce,
                        sel_col=NULL,
                        percentage=FALSE,
                        dataname="") {
  sce = validObject(sce) # check the sce object
  if (is.null(sel_col)) {
    sel_col = c("unaligned", "aligned_unmapped", "ambiguous_mapping",
                  "mapped_to_ERCC", "mapped_to_intron", "mapped_to_exon")
  }
  x = as.data.frame(QC_metrics(sce)[, sel_col])


  mapping_stat <- x
  mapping_stat$sample_name <- stats::reorder(rownames(mapping_stat), mapping_stat$mapped_to_exon)
  mapping_stat_prop <- as.data.frame(prop.table(as.matrix(mapping_stat[, sapply(mapping_stat, is.numeric)]), 1))
  mapping_stat_prop$sample_name <- mapping_stat$sample_name
  dat.m <- melt(mapping_stat, id.vars="sample_name")
  dat.m1 <- melt(mapping_stat_prop, id.vars="sample_name")

  if (!percentage) {
    p <- ggplot(dat.m, aes_string(x="sample_name", y="value", fill="variable")) + scale_fill_brewer(palette="Set1")+
      geom_bar(stat="identity", width=1)+
      ylab("number of reads")+
      xlab("cell sorted by number of reads mapped to exon")+
      theme(axis.title.x=element_blank(), axis.text.x=element_blank())

      if (dataname != "") {
        p <- p + ggtitle(paste0("Overall mapping statistics of ", dataname, " (number of reads)"))
      } else {
        p <- p + ggtitle(paste0("Overall mapping statistics (number of reads)"))
      }
  }
  else {
    p <- ggplot(dat.m1, aes_string(x="sample_name", y="value", fill="variable")) + scale_fill_brewer(palette="Set1")+
      geom_bar(stat="identity", width=1)+
      ylab("percentage of reads")+
      xlab("cell sorted by number of reads mapped_to_exon")+
      scale_y_continuous(labels=percent_format())+
      theme(axis.title.x=element_blank(), axis.text.x=element_blank())

      if (dataname != "") {
        p <- p + ggtitle(paste0("Overall mapping statistics of ", dataname, " (percentage)"))
      } else {
        p <- p + ggtitle(paste0("Overall mapping statistics (percentage)"))
      }
  }

  return(p)
}



#' plot_demultiplex
#'
#'@description Plot cell barcode demultiplexing result for the
#'  \code{SingleCellExperiment}. The barcode demultiplexing result is shown
#'  using a barplot, with the bars indicating proportions of total reads.
#'  Barcode matches and mismatches are summarised along with whether or not the
#'  read mapped to the genome. High proportion of genome aligned reads with no
#'  barcode match may indicate barcode integration failure.
#'
#' @param sce a \code{SingleCellExperiment} object
#'
#' @return a ggplot2 bar chart
#' @export
#'
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#'
#' plot_demultiplex(sce)
#'
plot_demultiplex = function(sce){
  sce = validObject(sce) # check the sce object
  demultiplex_stat = demultiplex_info(sce)
  demultiplex_stat[,"count"] = demultiplex_stat[,"count"]/sum(demultiplex_stat[,"count"])
  if (is.null(demultiplex_stat)){
    stop("`demultiplex_stat` does not exists in sce. demultiplex_info(sce) == NULL)")
  }
  demultiplex_stat$label_y = demultiplex_stat[,"count"]+0.05
  demultiplex_stat$label_tx = percent(demultiplex_stat[,"count"])
  #kp = demultiplex_stat[,"count"]/sum(demultiplex_stat[,"count"])>0.2
  #label_tx[!kp] = ""

  blank_theme = theme_minimal()+
    theme(
      axis.text.x = element_text(angle = 30, hjust=1,size=12),
      panel.border = element_blank(),
      plot.title=element_text(size=14, face="bold"),
      legend.position="none"
    )

  p = ggplot(data=demultiplex_stat, aes(x=status, y=count,
                                        fill=status))+
    geom_bar(width = 1, stat = "identity")+
    #coord_polar("y", start=0)+
    scale_fill_brewer(palette="Dark2")+
    blank_theme+
    geom_text(aes(y = label_y, label = label_tx))+
    labs(title="Cell barcode demultiplexing statistics", y="percentage",x="")
  return(p)
}


#' Plot UMI duplication frequency
#'
#' @description Plot the UMI duplication frequency.
#'
#' @param sce a \code{SingleCellExperiment} object
#' @param log10_x whether to use log10 scale for x axis
#'
#' @return a line chart of the UMI duplication frequency
#' @export
#'
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#'
#' plot_UMI_dup(sce)
#'
plot_UMI_dup = function(sce, log10_x = TRUE){
  sce = validObject(sce) # check the sce object
  tmp = UMI_dup_info(sce)
  tmp = tmp[-nrow(tmp),] # remove 1000
  the_sum = cumsum(tmp)
  for (i in 1:(nrow(tmp)-1)){ # cut the zero counts
    if (the_sum$count[nrow(tmp)] - the_sum$count[i]<10){
      tmp = tmp[1:i,]
      break
    }
  }
  dup_col = ""
  if ("duplication_number" %in% colnames(tmp)){
    dup_col = "duplication_number"
  }else if ("duplication.number" %in% colnames(tmp)){
    dup_col = "duplication.number"
  }else{
    message("cannot find column containing UMI duplication number.")
    return(NULL)
  }
  p = ggplot(data=tmp, aes(x=tmp[, dup_col], y=tmp[,"count"])) +
    geom_smooth(method = "loess",se = FALSE)+theme_bw()
  if (log10_x){
    p = p+scale_x_log10()+
          labs(title="Distribution of UMI duplication numbers(log10).",
               x="log10(UMI duplication number)",
               y="frequency")
  }else{
    p = p+labs(title="Distribution of UMI duplication numbers.",
               x="UMI duplication number",
               y="frequency")
  }
  return(p)
}



#' Remove outliers in \code{SingleCellExperiment}
#'
#' @description Removes outliers flagged by \code{detect_outliers()}
#'
#' @param sce a \code{SingleCellExperiment} object
#' @export
#' @return a \code{SingleCellExperiment} object without outliers
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#' sce = detect_outlier(sce)
#' dim(sce)
#' sce = remove_outliers(sce)
#' dim(sce)
#'
remove_outliers <- function(sce) {
  sce = validObject(sce) # check the sce object
  if (!("outliers" %in% colnames(QC_metrics(sce)))) {
    stop("No outlier information. Please run `detect_outlier()` to get the outliers.")
  }
  out_cell = QC_metrics(sce)$outliers == FALSE
  return(sce[, out_cell])
}

Try the scPipe package in your browser

Any scripts or data that you put into this service are public.

scPipe documentation built on Nov. 8, 2020, 8:28 p.m.