Socrates: Analysis of scATAC-seq data

Documented in projectUMAP reduceDims

###################################################################################################
###################################################################################################
###################################################################################################
#' Reduce dimensions of residual matrix
#'
#' Run SVD on Pearson residuals using IRLBA.
#'
#' @importFrom irlba irlba
#' @importFrom RcppML nmf
#'
#' @param obj list, object containing "pearson_residuals" output by the function regModel.
#' @param method character, string denoting dimension reduction method. Can be one of "SVD" or
#' "NMF". Defaults to SVD.
#' @param n.pcs numeric, number of singular values to calculate.
#' @param scaleVars logical, whether or not to scale PCs by variance explained (or to scale
#' NMF components by scale factors). Default to TRUE.
#' @param num.vars number of highly variable ACRs/bins to use for dimensionality reduction.
#' Variance is stabilized using loess regression between the feature variance and mean. 
#' Defaults to 5000. Set to NULL to use all ACRs/bins. To select features above a specific
#' stabilized variance value, set num.var to a numeric value less than 100. In all cases, 
#' Socrates will take a minimum of 100 features to perform dimensionality reduction.
#' @param regNum number of peaks/bins to use for regularization. regNum must be equal or
#' less than num.vars. Defaults to 5000. 
#' @param cor.max float, maximum spearman correlation between log10nSites (log10 number of
#' accessible peaks) and singular value to keep. Singular values with correlations greater than
#' cor.max are removed. Ranges from 0 to 1. Default set to 0.75.
#' @param doL2 logical, whether or not to L2 normalize barcodes.
#' @param doL1 logical, whether or not to L1 normalize barcodes
#' @param stdLSI logical, whether or not to standardize barcodes.
#' @param refit_residuals logical, whether or not to use quasibinomial logistic regression residuals
#' for num.vars features. Only applicable when num.vars < nrow(obj$counts) and when the normalization
#' was performed with tfidf. Defaults to FALSE.
#' @param residuals_slotName character, character string of the desired residual slotName. Defaults
#' to "residuals".
#' @param svd_slotName character, character string for naming the SVD output in the returned
#' object. Defaults to "PCA".
#' @param verbose logical. Defaults to FALSE.
#' @param ... Additional arguments to be passed to RcppML::nmf
#'
#' @rdname reduceDims
#' @export
reduceDims <- function(obj,
                       method="SVD",
                       n.pcs=50,
                       scaleVar=T,
                       num.var=5000,
		               regNum=5000,
                       cor.max=0.75,
                       doL2=F,
                       doL1=F,
                       doSTD=T,
                       refit_residuals=F,
                       residuals_slotName="residuals",
                       svd_slotName="PCA",
                       verbose=FALSE,
                       ...){
    
    # sub functions
    l2norm <- function(x){x/sqrt(sum(x^2))}
    l1norm <- function(x){x/(sum(x))}
    RowVar <- function(x){
        spm <- t(x)
        stopifnot( methods::is( spm, "dgCMatrix" ) )
        ans <- sapply( base::seq.int(spm@Dim[2]), function(j) {
            if( spm@p[j+1] == spm@p[j] ) { return(0) } # all entries are 0: var is 0
            mean <- base::sum( spm@x[ (spm@p[j]+1):spm@p[j+1] ] ) / spm@Dim[1]
            sum( ( spm@x[ (spm@p[j]+1):spm@p[j+1] ] - mean )^2 ) +
                mean^2 * ( spm@Dim[1] - ( spm@p[j+1] - spm@p[j] ) ) } ) / ( spm@Dim[1] - 1 )
        names(ans) <- spm@Dimnames[[2]]
        ans
    }
    
    # check if slotName exists
    if(is.null(obj[[residuals_slotName]])){
        message(" ERROR: useSlot - ", residuals_slotName, " - is missing ...")
        stop("exiting")
    }
    
    # if use subset
    if(!is.null(num.var)){
        if(verbose){message(" - identifying variable features for clustering ...")}
        if(num.var >= 100){
            row.var <- RowVar(obj[[residuals_slotName]])
            row.means <- Matrix::rowMeans(obj[[residuals_slotName]])
            adj.row.var <- loess(row.var~row.means)$residuals
            names(adj.row.var) <- names(row.var)
            adj.row.var <- adj.row.var[order(adj.row.var, decreasing=T)]
            topSites <- names(head(adj.row.var, n=num.var))
        }else{
            row.var <- RowVar(obj[[residuals_slotName]])
            row.means <- Matrix::rowMeans(obj[[residuals_slotName]])
            adj.row.var <- loess(row.var~row.means)$residuals
            names(adj.row.var) <- names(row.var)
            adj.row.var <- adj.row.var[order(adj.row.var, decreasing=T)]
            topSites <- names(adj.row.var[adj.row.var > num.var])
            if(length(topSites) < 50){
                topSites <- names(adj.row.var[1:100])
            }
        }
        
        # verbose
        if(verbose){message(" - keeping ",length(topSites), " variable features for dimensionality reduction ...")}
        
        # input matrix
        if(refit_residuals & obj$norm_method =="tfidf"){
            if(regNum > length(topSites)){
                regNum <- length(topSites)
            }
            test.dat <- list(counts=obj$counts[topSites,], meta=obj$meta)
            M <- regModel(test.dat, subpeaks=regNum, verbose=verbose)$residuals
            M <- Matrix(t(apply(M, 1, function(x){x - min(x, na.rm=T)})), sparse=T)
            M <- M[Matrix::rowSums(M) > 0,]
        }else{
            M <- obj$residuals[topSites,]
            if(obj$norm_method != "tfidf" & method=="NMF"){
                M <- t(apply(M, 1, function(x){
                    x - min(x, na.rm=T)
                }))
            }
            M <- M[Matrix::rowSums(M) > 0,]
        }
    }else{
        M <- obj[[residuals_slotName]]
    }
    
    # choose method
    if(method=="SVD"){
        
        # set method  used
        obj$rdMethod <- "SVD"
        
        # verbose
        if(verbose){message(" - reduce dimensions with SVD ... ")}
        
        # run
        pcs <- irlba(t(M), nv=n.pcs)
        pc <- pcs$u
        
        # scale by % variance
        if(scaleVar){
            pc <- pc %*% diag(pcs$d)
        }
        pc[is.na(pc)] <- 0
        
    }else if(method=="NMF"){
        
        # set method used
        obj$rdMethod <- "NMF"
        
        # if use subset
        if(verbose){message(" - running NMF...")}
        
        # run NMF
        pcs <- RcppML::nmf(M, n.pcs, verbose=verbose, ...)
        pcs$u <- t(pcs$h)
        pcs$v <- pcs$w
        if(scaleVar){
            pc <- t(pcs$h) %*% Diagonal(x=1/pcs$d)
        }else{
            pc <- t(pcs$h)
        }
        pc[is.na(pc)] <- 0
        
    }
    
    # add colnames
    rownames(pc) <- colnames(obj[[residuals_slotName]])
    colnames(pc) <- paste0("PC_", seq(1:ncol(pc)))
    
    # remove PCs with correlation to read-depth
    if(verbose){message(" - removing components correlated to read depth...")}
    if(cor.max < 1){
        depth <- Matrix::colSums(obj$counts[,rownames(pc)])
        cors <- apply(pc, 2, function(u) cor(u,depth,method="spearman"))
        cors <- abs(cors)
        idx.keep <- cors < cor.max
        pc <- pc[,idx.keep]
    }else{
        idx.keep <- seq(1:ncol(pc))
    }
    
    # standardize, L2, or L1 reduced dimensions per cell
    if(verbose & any(c(doL2, doL1, doSTD))){message(" - normalizing reduced dimensions...")}
    if(doL2){
        pc <- t(apply(pc, 1, l2norm))
    }else if(doL1){
        pc <- t(apply(pc, 1, l1norm))
    }else if(doSTD){
        pc <- t(apply(pc, 1, function(x){(x-mean(x, na.rm=T))/sd(x, na.rm=T)}))
    }
    pc[is.na(pc)] <- 0

    # return
    obj[[svd_slotName]] <- pc
    model_slot <- paste0(svd_slotName, "_model")
    obj[[model_slot]] <- list()
    obj[[model_slot]]$d <- pcs$d
    obj[[model_slot]]$v <- pcs$v
    obj[[model_slot]]$u <- pcs$u
    obj[[model_slot]]$keep_pcs <- idx.keep
    obj$hv_sites <- rownames(M)
    return(obj)
    
    
}


###################################################################################################
###################################################################################################
###################################################################################################
#' Project cells into a reduced embedding with UMAP
#'
#' @importFrom uwot umap
#'
#' @param obj list, object containing 'PCA' for projecting into a reduced embedding with UMAP.
#' @param m.dist numeric, m_dist parameter for uwot::umap. Defaults to 0.1
#' @param k.near numeric, k-nearest neighbors used by the uwot::umap algorithm. Defaults to 15.
#' @param metric character, distance metric used by uwot::umap. Defaults to cosine.
#' @param svd_slotName character, name of desired svd_slotName for running UMAP. Defaults to "PCA".
#' @param umap_slotName character, name of desired umap_slotName for return UMAP results. Defaults
#' to "UMAP".
#' @param verbose, logical. Defaults to FALSE.
#'
#' @rdname projectUMAP
#' @export
projectUMAP <- function(obj,
                        m.dist=0.01,
                        k.near=40,
                        metric="cosine",
                        svd_slotName="PCA",
                        umap_slotName="UMAP",
                        verbose=FALSE,
                        seed=1){

    # checks
    if(is.null(obj[[svd_slotName]])){
        message(" - ERROR: ", svd_slotName, " slot is not present in object")
        stop()
    }

    # verbose
    if(verbose){message(" - non-linear dimensionality reduction with UMAP ...")}

    # run UMAP (uwot implementation)
    set.seed(seed)
    umap.res <- uwot::umap(obj[[svd_slotName]],
                           verbose=verbose,
                           min_dist=m.dist,
                           n_neighbors=k.near,
                           metric=metric,
                           ret_model=T)

    # convert to data frame
    umap.out <- as.data.frame(umap.res$embedding)
    rownames(umap.out) <- rownames(obj[[svd_slotName]])
    colnames(umap.out) <- c("umap1", "umap2")

    # return object
    obj$meta$umap1 <- umap.out[rownames(obj$meta),]$umap1
    obj$meta$umap2 <- umap.out[rownames(obj$meta),]$umap2
    obj[[umap_slotName]] <- umap.out
    obj$UMAP_model <- umap.res
    return(obj)
}