R/getArtificialDoublets.R
In scDblFinder: scDblFinder

Documented in addDoublets createDoublets getArtificialDoublets getCellPairs

#' getArtificialDoublets
#'
#' Create expression profiles of random artificial doublets.
#'
#' @param x A count matrix, with features as rows and cells as columns.
#' @param n The approximate number of doublet to generate (default 3000).
#' @param clusters The optional clusters labels to use to build cross-cluster 
#' doublets.
#' @param resamp Logical; whether to resample the doublets using the poisson
#' distribution. Alternatively, if a proportion between 0 and 1, the proportion
#' of doublets to resample.
#' @param halfSize Logical; whether to half the library size of doublets 
#' (instead of just summing up the cells). Alternatively, a number between 0 
#' and 1 can be given, determining the proportion of the doublets for which
#' to perform the size adjustment.
#' @param adjustSize Logical; whether to adjust the size of the doublets using
#' the ratio between each cluster's median library size. Alternatively, a number
#' between 0 and 1 can be given, determining the proportion of the doublets for
#' which to perform the size adjustment.
#' @param propRandom The proportion of the created doublets that are fully
#'  random (default 0.1); the rest will be doublets created across clusters. 
#'  Ignored if `clusters` is NULL.
#' @param selMode The cell pair selection mode for inter-cluster doublet 
#' generation, either 'uniform' (same number of doublets for each combination),
#' 'proportional' (proportion expected from the clusters' prevalences), or
#' 'sqrt' (roughly the square root of the expected proportion).
#' @param n.meta.cells The number of meta-cell per cluster to create. If given,
#' additional doublets will be created from cluster meta-cells. Ignored if 
#' `clusters` is missing.
#' @param meta.triplets Logical; whether to create triplets from meta cells. 
#' Ignored if `clusters` is missing.
#'
#' @return A list with two elements: `counts` (the count matrix of
#' the artificial doublets) and `origins` the clusters from which each 
#' artificial doublets originated (NULL if `clusters` is not given).
#' 
#' @examples
#' m <- t(sapply( seq(from=0, to=5, length.out=50), 
#'                FUN=function(x) rpois(30,x) ) )
#' doublets <- getArtificialDoublets(m, 30)
#' 
#' @export
getArtificialDoublets <- function( x, n=3000, clusters=NULL,resamp=0.5,
                                   halfSize=0.5, adjustSize=0.2, propRandom=0.1, 
                                   selMode=c("proportional","uniform","sqrt"),
                                   n.meta.cells=2, meta.triplets=TRUE ){
  selMode <- match.arg(selMode)
  ls <- Matrix::colSums(x)
  w <- which(ls>0 & ls>=quantile(ls,0.01) & ls<=quantile(ls,0.99))
  x <- x[,w,drop=FALSE]
  if(!is.null(clusters)){
    if(is.list(clusters)){
      clusters$k <- clusters$k[w]
      clo <- clusters
      clusters <- clusters$k
    }else{
      clusters <- clusters[w]
      if(any(is.na(clusters))){
        w <- which(!is.na(clusters))
        clusters <- clusters[w]
        x <- x[,w]
      }
      clo <- clusters
    }
    clusters <- droplevels(as.factor(clusters))
  }
  
  if(is.null(clusters) || propRandom==1){
    # create random combinations
    if(ncol(x)^2 <= n){
      # few combinations, get them all
      ad <- expand.grid(seq_len(ncol(x)),seq_len(ncol(x)))
    }else{
      ad <- matrix(sample.int(ncol(x), 2*n, replace=(2*n >= ncol(x))),ncol=2)
    }
    # remove doublets of the same cell
    ad <- ad[which(ad[,1]!=ad[,2]),]
    # create doublets
    if(is.null(clusters)){
      ad.m <- createDoublets(x, ad, adjustSize=FALSE, resamp=resamp, 
                             halfSize=halfSize, prefix="rDbl.")
      oc <- NULL
    }else{
      oc1 <- matrix(clusters[as.numeric(ad)],ncol=2)
      ad.m <- createDoublets(x, ad, clusters=clusters, adjustSize=adjustSize, 
                             halfSize=halfSize, resamp=resamp, prefix="rDbl.")
      oc <- paste(oc1[,2],oc1[,1],sep="+")
      oc[which(oc1[,2]==oc1[,1])] <- NA
    }
    row.names(ad.m) <- row.names(x)
    return(list( counts=ad.m, origins=as.factor(oc) ))
  }
  
  if((nr <- ceiling(n*propRandom))>0){
    ad.m <- getArtificialDoublets(x, n=nr, clusters=clusters, 
                                  halfSize=halfSize, resamp=resamp,
                                  propRandom=1, adjustSize=adjustSize)
    oc <- ad.m$origins
    ad.m <- ad.m$counts
    n <- ceiling(n*(1-propRandom))
  }else{
    ad.m <- x[,c(),drop=FALSE]
    oc <- character()
  }

  if(length(unique(clusters))<3) n.meta.cells <- 0
  
  # create doublets across clusters:
  n <- ceiling(n)
  ca <- getCellPairs(clo, n=ifelse(n.meta.cells>0,ceiling(n*0.9),n), 
                     selMode=selMode)
  m2 <- createDoublets(x, ca, clusters=clusters, adjustSize=adjustSize,
                       halfSize=halfSize, resamp=resamp)
  oc <- c(oc, as.character(ca$orig.clusters))
  names(oc) <- names(m2)
  ad.m <- cbind(ad.m, m2)
  rm(m2)
  gc(verbose=FALSE)
  
  if(n.meta.cells>0){
    # create doublets from meta cells:
    meta <- .getMetaCells(x, clusters, n.meta.cells=n.meta.cells, 
                          meta.cell.size=30)
    cl2 <- meta$clusters
    meta <- meta$mu
    ca <- getCellPairs(cl2, n=ceiling(n*0.1))
    m2 <- createDoublets(meta, ca, clusters=cl2, adjustSize=FALSE, 
                         halfSize=FALSE, resamp=TRUE, prefix="artMetaDbl.")
    oc2 <- ca$orig.clusters
    names(oc2) <- colnames(m2)
    ad.m <- cbind(ad.m, m2)
    oc <- c(oc,as.character(oc2))
  }
  pc10 <- length(clusters)/10
  tt <- table(clusters)
  if(meta.triplets && sum(tt>=pc10)>2){
    # get clusters that have more than 10% of the cells
    cl2 <- names(tt)[tt>=pc10]
    # otherwise get the 3 largest clusters
    if(length(cl2)<3) cl2 <- names(sort(tt, decreasing=TRUE))[1:3]
    w <- which(clusters %in% cl2)
    # create triplets from meta cells:
    meta <- .getMetaCells(x[,w], clusters[w], n.meta.cells=1, 
                          meta.cell.size=100)$mu
    i <- seq_len(ncol(meta))
    ca <- expand.grid(i, i, i)
    ca <- ca[ca[,1]<ca[,2] & ca[,2]<ca[,3],,drop=FALSE]
    m2 <- round((meta[,ca[,1],drop=FALSE]+meta[,ca[,2]]+meta[,ca[,3]])/2)
    if(is(ad.m,"dgCMatrix")) m2 <- as(m2,"dgCMatrix")
    oc2 <- rep(NA_character_, ncol(m2))
    names(oc2) <- colnames(m2) <- 
      paste0("artTriplet.", ncol(ad.m)+seq_len(ncol(m2)))
    ad.m <- cbind(ad.m, m2)
    oc <- c(oc, oc2)
  }
  list( counts=ad.m, origins=as.factor(oc) )
}

#' getCellPairs
#'
#' Given a vector of cluster labels, returns pairs of cross-cluster cells
#' 
#' @param x A vector of cluster labels for each cell, or a list containing 
#' metacells and graph
#' @param n The number of cell pairs to obtain
#' @param ... Further arguments, for instance the `k` vector of precluster 
#' labels if `x` is a metacell graph.
#'
#' @return A data.frame with the columns
#' @export
#'
#' @examples
#' # create random labels
#' x <- sample(head(LETTERS), 100, replace=TRUE)
#' getCellPairs(x, n=6)
getCellPairs <- function(x, n=1000, ...){
  if(is(x,"igraph")) return(.getCellPairsFromGraph(x, n=n, ...))
  if(is.list(x) && all(c("graph","k") %in% names(x)))
     return(.getCellPairsFromGraph(x$graph, k=x$k, n=n, ...))
  .getCellPairsFromClusters(x, n=n, ...)
}

# get cross-cluster pairs of cells
.getCellPairsFromClusters <- function(clusters, n=1000, ls=NULL, q=c(0.1,0.9),
                                      selMode="uniform", soft.min=5, ...){
  cli <- split(seq_along(clusters), clusters)
  if(!is.null(ls)){
    ls <- split(ls, clusters)
    for(f in names(cli)){
        qr <- as.numeric(quantile(ls[[f]], prob=q))
        cli[[f]] <- cli[[f]][which(ls[[f]]>=qr[1] & ls[[f]]<=qr[2])]
    }
  }
  ca <- expand.grid(seq_along(cli), seq_along(cli))
  ca <- as.data.frame(ca[ca[,1]<ca[,2],])
  ca$orig <- factor(paste( names(cli)[ca[,1]], names(cli)[ca[,2]], sep="+"))
  if(selMode=="uniform"){
    ca$n <- ceiling(n/nrow(ca))
  }else{
    ed <- getExpectedDoublets(clusters, only.heterotypic=TRUE)
    ed <- ed*n/sum(ed)
    if(selMode=="sqrt") ed <- sqrt(ed)
    ed <- soft.min + ed
    ed <- ceiling(ed*n/sum(ed))
    ca$n <- ed[as.character(ca$orig)]
  }
  ca <- ca[ca$n>0,]
  lvls <- levels(ca$orig)
  ca <- do.call(rbind, lapply( seq_len(nrow(ca)), FUN=function(i){ 
    cbind( cell1=sample(cli[[ca[i,1]]], size=ca[i,4],
                        replace=ca[i,4]>length(cli[[ca[i,1]]])),
           cell2=sample(cli[[ca[i,2]]], size=ca[i,4],
                        replace=ca[i,4]>length(cli[[ca[i,2]]])),
           orig.clusters=rep(ca[i,3],ca[i,4]) )
  }))
  ca <- as.data.frame(ca)
  ca$orig.clusters <- factor(ca$orig.clusters, levels=seq_along(lvls), labels=lvls)
  ca[!duplicated(ca),]
}

# get pairs of cells based on distances on the meta-cell graph
#' @importFrom igraph distances V
.getCellPairsFromGraph <- function(g, k=seq_len(length(V(g))), n=1000, 
                                   weightFun=NULL, ...){
  cli <- split(seq_along(k), k)
  ca <- expand.grid(seq_along(cli), seq_along(cli))
  ca <- as.data.frame(ca[ca[,1]<ca[,2],])
  d <- distances(g)
  d[is.infinite(d)] <- max(d[!is.infinite(d)])
  if(is.null(weightFun)) weightFun <- function(d) sqrt(d)-0.5
  if(is.character(weightFun)) weightFun <- get(weightFun)
  d <- weightFun(d)
  d[is.nan(d) | d<0] <- 0
  ca$dist <- vapply(seq_len(nrow(ca)), FUN.VALUE=numeric(1), 
                    FUN=function(i) d[ca[i,1],ca[i,2]])
  ca$n <- rpois(nrow(ca), n/sum(ca$dist)*ca$dist)
  ca <- ca[ca$n>0,]
  oc <- rep(factor(paste(names(cli)[ca[,1]], names(cli)[ca[,2]], sep="+")),ca$n)
  ca <- do.call(rbind, lapply( seq_len(nrow(ca)), FUN=function(i){ 
    cbind( sample(cli[[ca[i,1]]],size=ca$n[i],replace=TRUE),
           sample(cli[[ca[i,2]]],size=ca$n[i],replace=TRUE) )
  }))
  ca <- data.frame(ca, orig.clusters=oc)
  colnames(ca) <- c("cell1","cell2","orig.clusters")
  ca[!duplicated(ca),]
}


# creates within-cluster meta-cells from a count matrix
.getMetaCells <- function(x, clusters, n.meta.cells=20, meta.cell.size=20){
  if(is.factor(clusters)) clusters <- droplevels(clusters)
  cli <- split(seq_along(clusters), clusters)
  meta <- lapply(cli, FUN=function(x){
    lapply(seq_len(n.meta.cells), FUN=function(y){
      sample(x,min(ceiling(0.6*length(x)),meta.cell.size),replace=FALSE)
    })
  })
  cl2 <- rep(names(cli), lengths(meta))
  meta <- vapply(unlist(meta, recursive=FALSE), FUN.VALUE=double(nrow(x)), 
                 FUN=function(y){ Matrix::rowMeans(x[,y,drop=FALSE]) })
  colnames(meta) <- paste0("metacell.",seq_len(ncol(meta)))
  list(mu=meta, clusters=cl2)
}


#' addDoublets
#'
#' Adds artificial doublets to an existing dataset
#'
#' @param x A count matrix of singlets, or a 
#'  \code{\link[SummarizedExperiment]{SummarizedExperiment-class}}
#' @param clusters A vector of cluster labels for each column of `x`
#' @param dbr The doublet rate
#' @param only.heterotypic Whether to add only heterotypic doublets.
#' @param adjustSize Whether to adjust the library sizes of the doublets.
#' @param prefix Prefix for the colnames generated.
#' @param ... Any further arguments to \code{\link{createDoublets}}.
#'
#' @return A `SingleCellExperiment` with the colData columns `cluster` and 
#' `type` (indicating whether the cell is a singlet or doublet).
#' 
#' @export
#' @examples
#' sce <- mockDoubletSCE(dbl.rate=0)
#' sce <- addDoublets(sce, clusters=sce$cluster)
addDoublets <- function(x, clusters, dbr=(0.01*ncol(x)/1000), 
                        only.heterotypic=TRUE, adjustSize=FALSE, 
                        prefix="doublet.", ...){
  ed <- round(getExpectedDoublets(clusters, dbr=dbr, 
                                  only.heterotypic=only.heterotypic))
  ed <- ed[ed>0]
  if(length(ed)==0) return(x)
  if(is(x, "SingleCellExperiment")) x <- counts(x)
  ca <- getCellPairs(clusters, n=length(ed)*(30+max(ed)), ls=Matrix::colSums(x))
  ca <- split(ca, ca$orig.clusters)
  ca <- do.call(rbind, lapply(names(ed), FUN=function(i){
    ca2 <- ca[[i]]
    ca2[sample.int(nrow(ca2), ed[[i]]),,drop=FALSE]
  }))
  cl <- as.factor(c(as.character(clusters), as.character(ca[,3])))
  m2 <- createDoublets(x, ca, clusters, adjustSize=adjustSize, prefix=prefix)
  cd <- data.frame(type=rep(c("singlet","doublet"),c(ncol(x),ncol(m2))),
                   cluster=cl)
  SingleCellExperiment(list(counts=cbind(x, m2)), colData=cd)
}

#' createDoublets
#' 
#' Creates artificial doublet cells by combining given pairs of cells
#' 
#' @param x A count matrix of real cells
#' @param dbl.idx A matrix or data.frame with pairs of cell indexes stored in 
#' the first two columns.
#' @param clusters An optional vector of cluster labels (for each column of `x`)
#' @param resamp Logical; whether to resample the doublets using the poisson
#' distribution. Alternatively, if a proportion between 0 and 1, the proportion
#' of doublets to resample.
#' @param halfSize Logical; whether to half the library size of doublets 
#' (instead of just summing up the cells). Alternatively, a number between 0 
#' and 1 can be given, determining the proportion of the doublets for which
#' to perform the size adjustment. Ignored if not resampling.
#' @param adjustSize Logical; whether to adjust the size of the doublets using
#' the median sizes per cluster of the originating cells. Requires `clusters` to
#' be given. Alternatively to a logical value, a number between 0 and 1 can be 
#' given, determining the proportion of the doublets for which to perform the 
#' size adjustment.
#' @param prefix Prefix for the colnames generated.
#'
#' @return A matrix of artificial doublets.
#' @export
#' @examples
#' sce <- mockDoubletSCE()
#' idx <- getCellPairs(sce$cluster, n=200)
#' art.dbls <- createDoublets(sce, idx)
createDoublets <- function(x, dbl.idx, clusters=NULL, resamp=0.5, 
                           halfSize=0.5, adjustSize=FALSE, prefix="dbl."){
  adjustSize <- .checkPropArg(as.numeric(adjustSize),FALSE)
  halfSize <- .checkPropArg(as.numeric(halfSize),FALSE)
  resamp <- .checkPropArg(as.numeric(resamp),FALSE)
  if(is(x,"SingleCellExperiment")) x <- counts(x)
  if(adjustSize>1 || adjustSize<0)
      stop("`adjustSize` should be a logical or a number between 0 and 1.")
  if(halfSize>1 || halfSize<0)
      stop("`adjustSize` should be a logical or a number between 0 and 1.")
  wAd <- sample.int(nrow(dbl.idx), size=round(adjustSize*nrow(dbl.idx)))
  wNad <- setdiff(seq_len(nrow(dbl.idx)),wAd)
  x1 <- x[,dbl.idx[wNad,1],drop=FALSE]+x[,dbl.idx[wNad,2],drop=FALSE]
  if(length(wAd)>1){
    if(is.null(clusters)) stop("If `adjustSize=TRUE`, clusters must be given.")
    dbl.idx <- as.data.frame(dbl.idx[wAd,,drop=FALSE])
    ls <- Matrix::colSums(x)
    csz <- vapply(split(ls,clusters), FUN=median, FUN.VALUE=numeric(1))
    dbl.idx$ls.ratio <- ls[dbl.idx[,1]]/(ls[dbl.idx[,1]]+ls[dbl.idx[,2]])
    ls1 <- csz[as.character(clusters[dbl.idx[,1]])]
    ls2 <- csz[as.character(clusters[dbl.idx[,2]])]
    dbl.idx$factor <- (dbl.idx$ls.ratio+ls1/(ls1+ls2))/2
    dbl.idx$factor[dbl.idx$factor>0.8] <- 0.8
    dbl.idx$factor[dbl.idx$factor<0.2] <- 0.2
    dbl.idx$ls <- (ls[dbl.idx[,1]]+ls[dbl.idx[,2]])
    x2 <- x[,dbl.idx[,1]]*dbl.idx$factor+x[,dbl.idx[,2]]*(1-dbl.idx$factor)
    x2 <- tryCatch(x2 %*% diag(dbl.idx$ls/Matrix::colSums(x2)),
                   error=function(e) t(t(x2)/Matrix::colSums(x2)))
    x1 <- cbind(x1,x2)
    rm(x2)
  }
  x <- x1
  rm(x1)
  if(halfSize>0){
    wAd <- sample.int(nrow(dbl.idx), size=ceiling(halfSize*nrow(dbl.idx)))
    if(length(wAd)>0)    x[,wAd] <- x[,wAd]/2
  }
  if(resamp>0){
    if(resamp!=halfSize) wAd <- sample.int(ncol(x), ceiling(resamp*ncol(x)))
    if(length(wAd)>0)
      x[,wAd] <- matrix(as.integer(rpois(nrow(x)*length(wAd), 
                                       as.numeric(as.matrix(x[,wAd])))), 
                         nrow=nrow(x))
  }else{
    x <- round(x)
  }
  x <- as(x,"dgCMatrix")
  colnames(x) <- paste0( prefix, seq_len(ncol(x)) )
  x
}