R/doublet_analysis.R

###################################################################################################
###################################################################################################
###################################################################################################
#' detectDoublets
#'
#' This function estimates doublet likelihood via synthetic doublet creation and enrichment
#' analysis. The underlying code is heavily adapted from ArchR (Granja et al.). If you use
#' detectDoublets in your analysis, please cite the original ArchR paper.
#'
#' @importFrom nabor knn
#' @importFrom parallel mclapply
#'
#' @param obj Socrates object. For best results, run this function after cleanCells and prior to
#' normalization.
#' @param nTrials Numeric. Number of times nSample cells are simulated.
#' @param nSample Numeric. Number of synthetic doublets to create per trial.
#' @param k Numeric. Number of nearest neighbors to search for when estimating doublet enrichment via knn
#' from the nabor package.
#' @param threads Numeric. Number of threads to use for mclapply from the parallel package.
#' @param rdMethod Character. Dimensionality reduction method used previously, either "SVD" or "NMF".
#' Defaults to NULL, which uses the method stored in obj$rdMethod.
#' @param svd_slotName Character. Slot name containing reduced dimensions from reduceDims function.
#' Default is PCA. 
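#'
#' @return Socrates object with a new `doublets` slot (per-cell doublet enrichment and -log10
#' adjusted binomial P-value scores for both the SVD and UMAP embeddings, plus the projected
#' embeddings themselves) and a `doubletscore` column added to the meta slot.
#'
#' @examples
#' \dontrun{
#' # Minimal usage sketch; `soc.obj` is a hypothetical Socrates object that has already been
#' # cleaned (cleanCells), normalized, and reduced with reduceDims (svd_slotName "PCA"), and
#' # that carries a fitted UMAP model in soc.obj$UMAP_model.
#' soc.obj <- detectDoublets(soc.obj, nTrials=5, nSample=1000, k=10, threads=4)
#' head(soc.obj$doublets$doubletEnrichUMAP)
#' summary(soc.obj$meta$doubletscore)
#' }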
#'
#' @rdname detectDoublets
#' @export
#'
detectDoublets <- function(obj=NULL,
                           nTrials=5,
                           nSample=1000,
                           k=10,
                           threads=1,
                           rdMethod=NULL,
                           svd_slotName="PCA"){
    
    # pre-checks
    if(is.null(obj)){
        stop("! Socrates object is required ...")
    }
    model_name <- paste0(svd_slotName, "_model")
    if(is.null(obj[[model_name]])){
        stop(" slot: ", model_name, " does not exist. Please ensure that svd_slotName exists before proceeding...")
    }
    n.pcs <- length(obj[[model_name]]$keep_pcs)
    
    # hidden functions
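    # .sampleSparseMat: randomly zero out a (1 - sampleRatio) fraction of the non-zero entries
    # of a sparse matrix, so each synthetic parent cell contributes ~sampleRatio of its signal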
    .sampleSparseMat <- function(mat = NULL, sampleRatio = 0.5){
        total <- length(mat@x)
        sampleTo <- floor(total * (1-sampleRatio))
        mat@x[sample(seq_len(total), sampleTo)] <- 0
        mat <- drop0(mat)
        mat
    }
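    # .computeKNN: return the indices (into `data`) of the k nearest neighbors of each row of
    # `query`; when query is data itself, the self-hit is dropped unless includeSelf = TRUE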
    .computeKNN <- function(data=NULL,
                            query=NULL,
                            k=50,
                            includeSelf=FALSE,
                            ...){
        if(is.null(query)){
            query <- data
            searchSelf <- TRUE
        }else{
            searchSelf <- FALSE
        }
        if(searchSelf & !includeSelf){
            knnIdx <- nabor::knn(data = data, query = query, k = k + 1, ...)$nn.idx
            knnIdx <- knnIdx[,-1,drop=FALSE]
        }else{
            knnIdx <- nabor::knn(data = data, query = query, k = k, ...)$nn.idx
        }
        knnIdx
    }
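    # .projectSVD: re-normalize the counts with the same model used upstream, project the cells
    # into the existing SVD space (t(residuals) %*% v %*% diag(1/d), rescaled by d), and keep
    # only the previously retained components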
    .projectSVD <- function(sobj,
                            u=NULL,
                            v=NULL,
                            d=NULL,
                            n.pcs=NULL,
                            idx.keep=NULL,
                            normModel="regModel",
                            rdMethod="SVD",
                            sites=rownames(sobj$residuals)){
        
        # run normalization
        if(normModel == 'regModel'){
            sobj <- regModel(sobj, nthreads=threads)
        }else if(normModel == 'regModel2'){
            sobj <- regModel2(sobj, nthreads=threads)
        }else if(normModel == 'tfidf'){
            sobj <- tfidf(sobj)
        }else if(normModel == 'logisticModel'){
            sobj <- logisticModel(sobj, nthreads=threads)
        }
        
        # get V matrix
        #if(rdMethod == "SVD"){
        V <- Matrix::t(sobj$residuals[sites,]) %*% v %*% Matrix::diag(1/d)
        #}else if(rdMethod == "NMF"){
        #    V <- Matrix::t(sobj$residuals) %*% t(v) %*% Matrix::diag(1/d)
        #}
        
        # diagonal
        if(n.pcs > ncol(u)){
            n.pcs <- ncol(u)
        }
        svdDiag <- matrix(0, nrow=n.pcs, ncol=n.pcs)
        diag(svdDiag) <- d
        matSVD <- Matrix::t(svdDiag %*% Matrix::t(V))
        matSVD <- as.matrix(matSVD)
        rownames(matSVD) <- colnames(sobj$residuals)
        colnames(matSVD) <- paste0("PC_",seq_len(ncol(matSVD)))
        matSVD <- matSVD[,idx.keep]
        return(matSVD)
        
    }
    
    
    ##############################################################################
    # Simulate doublets
    ##############################################################################
    
    # specify objects
    sampleRatio1 <- c(0.5)
    sampleRatio2 <- c(0.5)
    mat <- obj$counts
    
    # scale nTrials by number of cells
    nTrials <- nTrials * max( floor(nrow(obj$meta) / nSample), 1 )
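    # e.g. with 20,000 cells and nSample = 1,000, nTrials = 5 becomes 5 * 20 = 100 trials
    # (~100,000 synthetic doublets)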
    
    # simulated matrix
    set.seed(1)
    message(" - Creating synthetic doublets ...")
    simMat <- mclapply(seq_len(nTrials), function(y){
        outs <- lapply(seq_along(sampleRatio1), function(x){
            idx1 <- sample(seq_len(ncol(mat)), nSample, replace = TRUE)
            idx2 <- sample(seq_len(ncol(mat)), nSample, replace = TRUE)
            simulatedMat <- .sampleSparseMat(mat = mat[,idx1], sampleRatio = sampleRatio1[x]) +
                .sampleSparseMat(mat = mat[,idx2], sampleRatio = sampleRatio2[x])
            simulatedMat@x[simulatedMat@x > 0] <- 1
            b <- data.frame(cellIDs=paste0("sim.",y,'.',seq_len(ncol(simulatedMat))),
                            row.names=paste0("sim.",y,'.',seq_len(ncol(simulatedMat))))
            b$nSites <- Matrix::colSums(simulatedMat)
            b$log10nSites <- log10(b$nSites)
            colnames(simulatedMat) <- rownames(b)
            rownames(simulatedMat) <- rownames(mat)
            simobj <- list(counts=simulatedMat, meta=b)
            simSVD <- .projectSVD(simobj,
                                  u=obj[[model_name]]$u,
                                  v=obj[[model_name]]$v,
                                  d=obj[[model_name]]$d,
                                  n.pcs=n.pcs,
                                  idx.keep=obj[[model_name]]$keep_pcs,
                                  normModel=obj$norm_method,
                                  rdMethod=ifelse(is.null(rdMethod), obj$rdMethod, rdMethod),
                                  sites=obj$hv_sites)
            return(simSVD)
        })
        outs <- do.call(rbind, outs)
    }, mc.cores = threads)
    simMat <- do.call(rbind, simMat)
    simMat <- as.matrix(simMat)
    message(" - Created ", nrow(simMat), " synthetic doublets ...")
    
    # project original
    message(" - Creating original projection ...")
    ogMat <- .projectSVD(obj,
                         u=obj[[model_name]]$u,
                         v=obj[[model_name]]$v,
                         d=obj[[model_name]]$d,
                         n.pcs=n.pcs,
                         idx.keep=obj[[model_name]]$keep_pcs,
                         normModel=obj$norm_method,
                         rdMethod=ifelse(is.null(rdMethod), obj$rdMethod, rdMethod),
                         sites=obj$hv_sites)
    
    # merge
    message(" - Merging synthetic and original cells ...")
    allMat <- rbind(simMat, ogMat)
    #allMat <- t(apply(allMat, 1, function(x){(x-mean(x, na.rm=T))/sd(x, na.rm=T)}))
    num.cells <- nrow(ogMat)
    rm(ogMat)

    allMat.num <- allMat
    class(allMat.num) <- "numeric"
    ind <- apply(allMat.num, 1, function(x) all(is.na(x))) 
    allMat.num <- allMat.num[ !ind, ]

    
    # run UMAP model
    message(" - Projecting to UMAP ...")
    set.seed(1)
    proj.umap <- uwot::umap_transform(X = as.matrix(allMat.num),
                                      model = obj$UMAP_model)
    rownames(proj.umap) <- rownames(allMat.num)
    colnames(proj.umap) <- c("umap1", "umap2")
    out <- list()
    
    
    ##############################################################################
    # Compute Doublet Scores from SVD Embedding
    ##############################################################################
    message(" - Computing KNN doublets (SVD)...")
    knnDoub <- .computeKNN(allMat[-seq_len(nrow(simMat)),], allMat[seq_len(nrow(simMat)),], k=k)
    knnDoub.svd <- knnDoub
    
    #Compile KNN Sums
    countKnn <- rep(0, num.cells)
    names(countKnn) <- rownames(allMat[-seq_len(nrow(simMat)),])
    tabDoub <- table(as.vector(knnDoub))
    countKnn[as.integer(names(tabDoub))] <- countKnn[as.integer(names(tabDoub))] + tabDoub
    scaleTo <- 10000
    scaleBy <- scaleTo / num.cells
    countknn.svd <- countKnn
    
    #P-Values
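    # For each original cell, ask whether it was hit as a doublet neighbor more often than expected
    # if hits were spread uniformly across cells (counts rescaled to 10,000 cells, null rate 1/scaleTo)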
    pvalBinomDoub <- unlist(lapply(seq_along(countKnn), function(x){
        countKnnx <- round(countKnn[x] * scaleBy)
        sumKnnx <- round(sum(countKnn) * scaleBy)
        pbinom(countKnnx - 1, sumKnnx, 1 / scaleTo, lower.tail = FALSE)
    }))
    
    #Adjust
    padjBinomDoub <- p.adjust(pvalBinomDoub, method = "bonferroni")
    
    #Convert To Scores
    doubletScore <- -log10(pmax(padjBinomDoub, 4.940656e-324))
    doubletEnrich <- (countKnn / sum(countKnn)) / (1 / num.cells)
    doubletEnrich <- 10000 * doubletEnrich / length(countKnn) #Enrichment Per 10000 Cells in Data Set
    
    #Store Results
    out$doubletEnrichSVD <- doubletEnrich
    out$doubletScoreSVD <- doubletScore
    
    
    ##############################################################################
    # Compute Doublet Scores from UMAP Embedding
    ##############################################################################
    message(" - Computing KNN doublets (UMAP)...")
    knnDoub <- .computeKNN(proj.umap[-seq_len(nrow(simMat)),], proj.umap[seq_len(nrow(simMat)),], k=k)
    knnDoub.umap <- knnDoub
    
    #Compile KNN Sums
    countKnn <- rep(0, num.cells)
    names(countKnn) <- rownames(allMat[-seq_len(nrow(simMat)),])
    tabDoub <- table(as.vector(knnDoub))
    countKnn[as.integer(names(tabDoub))] <- countKnn[as.integer(names(tabDoub))] + tabDoub
    scaleTo <- 10000
    scaleBy <- scaleTo / num.cells
    countknn.umap <- countKnn
    
    #P-Values
    pvalBinomDoub <- unlist(lapply(seq_along(countKnn), function(x){
        countKnnx <- round(countKnn[x] * scaleBy)
        sumKnnx <- round(sum(countKnn) * scaleBy)
        pbinom(countKnnx - 1, sumKnnx, 1 / scaleTo, lower.tail = FALSE)
    }))
    
    #Adjust
    padjBinomDoub <- p.adjust(pvalBinomDoub, method = "bonferroni")
    
    #Convert To Scores
    doubletScore <- -log10(pmax(padjBinomDoub, 4.940656e-324))
    doubletEnrich <- (countKnn / sum(countKnn)) / (1 / num.cells)
    doubletEnrich <- 10000 * doubletEnrich / length(countKnn) #Enrichment Per 10000 Cells in Data Set
    
    #Store Results
    out$doubletEnrichUMAP <- doubletEnrich
    out$doubletScoreUMAP <- doubletScore
    out$dSVD <- allMat
    out$dUMAP <- proj.umap
    out$knnDoublet.umap <- knnDoub.umap
    out$knnDoublet.svd <- knnDoub.svd
    out$countknn.umap <- countknn.umap
    out$countknn.svd <- countknn.svd
    obj$doublets <- out
    obj$meta$doubletscore <- obj$doublets$doubletEnrichUMAP[rownames(obj$meta)]
    return(obj)
    
}



###################################################################################################
###################################################################################################
###################################################################################################
#' filterDoublets
#'
#' This function identifies potential doublets. A new column in the meta slot (d.type) denotes
#' the doublet status. Doublets can also be directly filtered from the data set by setting
#' removeDoublets to TRUE.
#'
#'
#' @param obj Socrates object. For best results, use downstream of the function cleanCells prior to
#' normalization.
#' @param filterRatio Numeric. Multiplier on the expected number of doublets per library
#' (estimated as num.cells^2 / 100000); the top-scoring cells up to this count are flagged as
#' doublets. Defaults to 1.5.
#' @param embedding Character string. Which embedding to use for doublet estimation filtering. Defaults
#' to "UMAP".
#' @param libraryVar Character string. Meta data column containing library information. Defaults to
#' NULL, which is intended for single library/replicate objects. Users with multiple libraries/replicates
#' in the object must set the libraryVar parameter to the column name specifying each cell's library ID.
#' The default column name used by mergeSocratesRDS is "sampleID".
#' @param verbose Boolean. Defaults to TRUE.
#' @param umap_slotname Character string. Defaults to "UMAP".
#' @param svd_slotname Character string. Defaults to "PCA".
#' @param removeDoublets Boolean. Defaults to FALSE.
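#'
#' @return Socrates object with a `d.type` column ("doublet"/"singlet") added to the meta slot.
#' When removeDoublets is TRUE, flagged cells are also dropped from the counts matrix and the
#' UMAP/SVD slots, and the pre-filtering meta data is stored in `pre.doublet.meta`.
#'
#' @examples
#' \dontrun{
#' # Minimal usage sketch; `soc.obj` is a hypothetical Socrates object that has already been run
#' # through detectDoublets. For multi-library objects, set libraryVar (e.g. "sampleID").
#' soc.obj <- filterDoublets(soc.obj, filterRatio=1.5, embedding="UMAP")
#' table(soc.obj$meta$d.type)
#'
#' # flag and remove doublets in one step
#' soc.obj <- filterDoublets(soc.obj, filterRatio=1.5, removeDoublets=TRUE)
#' }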
#'
#' @rdname filterDoublets
#' @export
#'
filterDoublets <- function(obj=NULL, filterRatio=1.5, embedding="UMAP", libraryVar=NULL,
                           verbose=TRUE, umap_slotname="UMAP", svd_slotname="PCA", removeDoublets=FALSE){
    
    # split by library/replicate
    o.ids <- rownames(obj$meta)
    if(verbose){
        if(!is.null(libraryVar)){
            message(" - libraryVar: ", libraryVar)
        }
    }
    if(is.null(libraryVar)){
        message(" - libraryVar set to NULL: if there are more than one library in the input object, please specify the meta column ID containing library/replicate information.")
        libraryVar <- "temp"
        obj$meta[,libraryVar] <- 1
    }else if(! libraryVar %in% colnames(obj$meta)){
        message(" - The column specified by `libraryVar` (", libraryVar, ") does not exist in the meta data slot ...")
        stop(" - Error: please make sure that the parameter `libraryVar` is correctly set or is set to NULL for single-library objects ...")
    }
    obj$meta[,libraryVar] <- as.character(obj$meta[,libraryVar])
    libs <- unique(as.character(obj$meta[,libraryVar]))
    
    # iterate over each library/replicate
    outs <- lapply(libs, function(z){
        
        # subset by library/replicate
        obj.lib <- subset(obj$meta, obj$meta[,libraryVar]==z)
        num.cells <- nrow(obj.lib)
        ids <- rownames(obj.lib)
        
        # estimate # cells to remove
        rm.counts <- floor(filterRatio * ((num.cells^2) / 100000))
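        # num.cells^2 / 100000 corresponds to a per-cell doublet rate that grows with library size
        # (~1% at 1,000 cells); filterRatio scales this expectation up or down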
        keep.count <- num.cells - rm.counts
        
        # select embedding
        if(embedding == "UMAP"){
            obj.lib$doubletscore <- obj$doublets$doubletEnrichUMAP[rownames(obj.lib)]
        }else{
            obj.lib$doubletscore <- obj$doublets$doubletEnrichSVD[rownames(obj.lib)]
        }
        
        # reorder and call doublets
        obj.lib <- obj.lib[order(obj.lib$doubletscore, decreasing=T),]
        obj.lib$d.type <- c(rep("doublet", rm.counts), rep("singlet", keep.count))
        obj.lib <- obj.lib[ids,]
        return(obj.lib)
    })
    outs <- do.call(rbind, outs)
    
    # filter doublets
    if(removeDoublets){
        if(verbose){message("   * Doublet filtering * Input: cells = ", ncol(obj$counts), " | peaks = ", nrow(obj$counts))}
        num.cells <- nrow(outs)
        obj$pre.doublet.meta <- obj$meta
        obj$meta <- subset(outs, outs$d.type == "singlet")
        obj$counts <- obj$counts[,rownames(obj$meta)]
        obj$counts <- obj$counts[Matrix::rowSums(obj$counts)>0,]
        obj$counts <- obj$counts[,Matrix::colSums(obj$counts)>0]
        obj[[umap_slotname]] <- obj[[umap_slotname]][colnames(obj$counts),]
        obj[[svd_slotname]] <- obj[[svd_slotname]][colnames(obj$counts),]
        new.num.cells <- ncol(obj$counts)
        ratio <- signif(((1-new.num.cells/num.cells)*100), digits=3)
        if(verbose){message("   * Doublet filtering * Filtered (", ratio, "%) : cells = ", ncol(obj$counts), " | peaks = ", nrow(obj$counts))}
        
    }else{
        obj$meta <- outs[o.ids,]
    }
    
    # remove temp column if necessary
    if(libraryVar == "temp"){
        obj$meta$temp <- NULL
    }
    
    # return object
    return(obj)
}