R/mudan.R

Defines functions areColors fac2col plotEmbedding tsneLda clusterBasedBatchCorrect mergePredsList getConfidentPreds predictLds getStableClusters getMarkerGenes markerAuc getClusterGeneInfo getDifferentialGenes modelLda getComMembership getVariableGenes fastPca getPcs bh.adjust normalizeVariance normalizeCounts cleanCounts

Documented in cleanCounts clusterBasedBatchCorrect fastPca getComMembership getConfidentPreds getDifferentialGenes getPcs getStableClusters getVariableGenes mergePredsList modelLda normalizeCounts normalizeVariance plotEmbedding predictLds tsneLda

#' @import stats grDevices graphics
NULL

#' @title Filter a counts matrix
#' @description Filter a counts matrix based on gene (row) and cell (column)
#'      requirements.
#' @param counts A sparse read count matrix. The rows correspond to genes,
#'      columns correspond to individual cells
#' @param min.lib.size Minimum number of genes detected in a cell. Cells with
#'      fewer genes will be removed (default: 300)
#' @param max.lib.size Maximum number of genes detected in a cell. Cells with
#'      more genes will be removed (default: 8000)
#' @param min.reads Minimum number of reads per gene. Genes with fewer reads
#'      will be removed (default: 1)
#' @param min.detected Minimum number of cells a gene must be seen in. Genes
#'      not seen in a sufficient number of cells will be removed (default: 1)
#' @param verbose Verbosity (default: TRUE)
#' @return a filtered read count matrix
#' @examples
#' data(pbmcA)
#' dim(pbmcA)
#' mat <- cleanCounts(pbmcA)
#' dim(mat)
#' @export
#' @importFrom Matrix Matrix colSums rowSums
cleanCounts <- function(
  counts,
  min.lib.size = 300,
  max.lib.size = 8000,
  min.reads    = 1,
  min.detected = 1,
  verbose      = TRUE
) {

  if (!class(counts) %in% c("dgCMatrix", "dgTMatrix")) {
    if(verbose) {
      message('Converting to sparse matrix ...')
    }
    counts <- Matrix::Matrix(counts, sparse = TRUE)
  }

  if (verbose) {
    message('Filtering matrix with ', ncol(counts), ' cells and ', nrow(counts), ' genes ...')
  }

  ## filter out low-gene cells or potential doublets
  ix_col <- Matrix::colSums(counts)
  ix_col <- ix_col > min.lib.size & ix_col < max.lib.size
  counts <- counts[, ix_col]

  ## remove genes that don't have many reads
  counts <- counts[Matrix::rowSums(counts) > min.reads, ]

  ## remove genes that are not seen in a sufficient number of cells
  counts <- counts[Matrix::rowSums(counts > 0) > min.detected, ]

  if (verbose) {
    message('Resulting matrix has ', ncol(counts), ' cells and ', nrow(counts), ' genes')
  }

  return(counts)
}


#' Normalizes counts to CPM
#'
#' Normalizes raw counts to log10 counts per million with pseudocount
#'
#' @param counts Read count matrix. The rows correspond to genes, columns
#'      correspond to individual cells
#' @param depthScale Depth scaling. Using a million for CPM (default: 1e6)
#' @param verbose Verbosity (default: TRUE)
#'
#' @return a normalized matrix
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- normalizeCounts(cd)
#' }
#'
#' @export
#' @importFrom Matrix Matrix colSums t
normalizeCounts <- function(counts, depthScale=1e6, verbose=TRUE) {

  if (!class(counts) %in% c("dgCMatrix", "dgTMatrix")) {
    if(verbose) {
      message('Converting to sparse matrix ...')
    }
    counts <- Matrix::Matrix(counts, sparse = TRUE)
  }

  if (verbose) {
    message('Normalizing matrix with ', ncol(counts), ' cells and ', nrow(counts), ' genes')
  }

  counts <- Matrix::t(Matrix::t(counts) / Matrix::colSums(counts))
  counts <- counts * depthScale

  return(counts)
}


#' Normalize gene expression variance relative to transcriptome-wide expectations
#' (Modified from SCDE/PAGODA2 code)
#'
#' Normalizes gene expression magnitudes to with respect to its ratio to the
#' transcriptome-wide expectation as determined by local regression on expression magnitude
#'
#' @param counts Read count matrix. The rows correspond to genes, columns correspond to individual cells
#' @param gam.k Generalized additive model parameter; the dimension of the basis used to represent the smooth term (default: 5)
#' @param alpha Significance threshold (default: 0.05)
#' @param plot Whether to plot the results (default: FALSE)
#' @param use.unadjusted.pvals If true, will apply BH correction (default: FALSE)
#' @param do.par Whether to adjust par for plotting if plotting (default: TRUE)
#' @param max.adjusted.variance Ceiling on maximum variance after normalization to prevent infinites (default: 1e3)
#' @param min.adjusted.variance Floor on minimum variance after normalization (default: 1e-3)
#' @param verbose Verbosity (default: TRUE)
#' @param details If true, will return data frame of normalization parameters. Else will return normalized matrix.(default: FALSE)
#'
#' @return If details is true, will return data frame of normalization parameters. Else will return normalized matrix.
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' }
#'
#' @importFrom mgcv s
#'
#' @export
#'
normalizeVariance <- function(counts, gam.k=5, alpha=0.05, plot=FALSE, use.unadjusted.pvals=FALSE, do.par=TRUE, max.adjusted.variance=1e3, min.adjusted.variance=1e-3, verbose=TRUE, details=FALSE) {

  if (!class(counts) %in% c("dgCMatrix", "dgTMatrix")) {
    if(verbose) {
      message('Converting to sparse matrix ...')
    }
    counts <- Matrix::Matrix(counts, sparse = TRUE)
  }

  mat <- Matrix::t(counts) ## make rows as cells, cols as genes

  if(verbose) {
    print("Calculating variance fit ...")
  }
  dfm <- log(Matrix::colMeans(mat))
  dfv <- log(apply(mat, 2, var))
  names(dfm) <- names(dfv) <- colnames(mat)
  df <- data.frame(m=dfm, v=dfv)

  vi <- which(is.finite(dfv))

  if(length(vi)<gam.k*1.5) { gam.k=1 } ## too few genes

  if(gam.k<2) {
    if(verbose) {
      print("Using lm ...")
    }
    m <- lm(v ~ m, data = df[vi,])
  } else {
    if(verbose) {
      print(paste0("Using gam with k=", gam.k, "..."))
    }
    fm <- as.formula(sprintf("v ~ s(m, k = %s)", gam.k))
    m <- mgcv::gam(fm, data = df[vi,])
  }
  df$res <- -Inf;  df$res[vi] <- resid(m,type='response')
  n.cells <- ncol(mat)
  n.obs <- nrow(mat)
  df$lp <- as.numeric(pf(exp(df$res),n.obs,n.obs,lower.tail=F,log.p=T))
  df$lpa <- bh.adjust(df$lp,log=TRUE)
  df$qv <- as.numeric(qchisq(df$lp, n.cells-1, lower.tail = FALSE,log.p=TRUE)/n.cells)

  if(use.unadjusted.pvals) {
    ods <- which(df$lp<log(alpha))
  } else {
    ods <- which(df$lpa<log(alpha))
  }
  if(verbose) {
    print(paste0(length(ods), ' overdispersed genes ... ' ))
  }

  df$gsf <- geneScaleFactors <- sqrt(pmax(min.adjusted.variance,pmin(max.adjusted.variance,df$qv))/exp(df$v));
  df$gsf[!is.finite(df$gsf)] <- 0;

  if(plot) {
    if(do.par) {
      par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
    }
    smoothScatter(df$m,df$v,main='',xlab='log10[ magnitude ]',ylab='log10[ variance ]')
    grid <- seq(min(df$m[vi]),max(df$m[vi]),length.out=1000)
    lines(grid,predict(m,newdata=data.frame(m=grid)),col="blue")
    if(length(ods)>0) {
      points(df$m[ods],df$v[ods],pch='.',col=2,cex=1)
    }
    smoothScatter(df$m[vi],df$qv[vi],xlab='log10[ magnitude ]',ylab='',main='adjusted')
    abline(h=1,lty=2,col=8)
    if(is.finite(max.adjusted.variance)) { abline(h=max.adjusted.variance,lty=2,col=1) }
    points(df$m[ods],df$qv[ods],col=2,pch='.')
  }

  ## variance normalize
  norm.mat <- counts*df$gsf
  if(!details) {
    return(norm.mat)
  } else {
    ## return normalization factor
    return(list(mat=norm.mat, ods=ods, df=df))
  }
}
## BH P-value adjustment with a log option
bh.adjust <- function(x, log = FALSE) {
  nai <- which(!is.na(x))
  ox <- x
  x <- x[nai]
  id <- order(x, decreasing = FALSE)
  if(log) {
    q <- x[id] + log(length(x)/seq_along(x))
  } else {
    q <- x[id]*length(x)/seq_along(x)
  }
  a <- rev(cummin(rev(q)))[order(id)]
  ox[nai] <- a
  ox
}


#' Dimensionality reduction by PCA
#'
#' Dimensionality reduction using PCA by computing principal components using highly variable genes
#'
#' @param mat Variance normalized gene expression matrix.
#' @param nGenes Number of most variable genes. (default: 1000)
#' @param nPcs Number of principal components. (default: 100)
#' @param verbose Verbosity (default: TRUE)
#' @param ... Additional parameters to pass to irlba
#'
#' @return Matrix with columns as cells and rows as principal component eigenvectors.
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' pcs <- getPcs(mat)
#' }
#'
#' @export
#'
getPcs <- function(mat, nGenes = min(nrow(mat), 1000), nPcs = 100, verbose=TRUE, ...) {
  if(class(mat)!='matrix') {
    mat <- as.matrix(mat)
  }

  if(verbose) {
    print(paste0('Identifying top ', nPcs, ' PCs on ', nGenes, ' most variable genes ...'))
  }

  ## get variable genes
  if(nGenes < nrow(mat)) {
    vgenes <- getVariableGenes(mat, nGenes)
    mat <- mat[vgenes,]
  }

  ## get PCs
  pcs <- fastPca(t(mat), nPcs, center=TRUE, ...)
  m <- t(pcs$l)
  colnames(m) <- colnames(mat)
  rownames(m) <- paste0('PC', seq_len(nPcs))

  return(t(m))
}
#' Derive principal components by SVD
#'
#' @param m Matrix X
#' @param nPcs Number of principal components
#' @param tol irlba tol
#' @param scale Scale input
#' @param center Center input
#' @param ... Additional parameters to pass to irlba
#'
#' @export
#'
fastPca <- function(m, nPcs=2, tol=1e-10, scale=FALSE, center=TRUE, ...) {
  if(scale||center) {
    m <- scale(m, scale=scale, center=center)
  }
  a <- irlba::irlba(crossprod(m)/(nrow(m)-1), nu=0, nv=nPcs, tol=tol, ...)
  a$l <- m %*% a$v

  return(a)
}
#' Get variable genes
#'
#' @param mat Gene expression matrix
#' @param nGenes Number of genes
#'
#' @export
#'
getVariableGenes <- function(mat, nGenes) {
  vi <- apply(mat, 1, var)
  names(vi) <- rownames(mat)
  vgenes <- names(sort(vi, decreasing=TRUE)[seq_len(nGenes)])
  return(vgenes)
}


#' Get clusters by community detection on approximate nearest neighbors
#'
#' Group cells into clusters based on graph-based community detection on approximate nearest neighbors
#'
#' @param mat Matrix of cells as columns. Features as rows (such as PCs).
#' @param k K-nearest neighbor parameter.
#' @param method Community detection method from igraph. (default: igraph::cluster_walktrap)
#' @param verbose Verbosity (default: TRUE)
#' @param details Whether to return just community annotation or entended details including graph and graph modularity
#'
#' @return Vector of community annotations
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' pcs <- getPcs(mat)
#' com <- getComMembership(pcs, k=30)
#' }
#'
#' @export
#'
getComMembership <- function(mat, k, method=igraph::cluster_walktrap, verbose=TRUE, details=FALSE) {
  if(verbose) {
    print("finding approximate nearest neighbors ...")
  }
  knn <- RANN::nn2(mat, k=k)[[1]]

  ## convert to adjacency matrix
  adj <- matrix(0, nrow(mat), nrow(mat))
  rownames(adj) <- colnames(adj) <- rownames(mat)
  invisible(lapply(seq_len(nrow(mat)), function(i) {
    adj[i,rownames(mat)[knn[i,]]] <<- 1
  }))

  ## convert to graph for clustering
  if(verbose) {
    print("calculating clustering ...")
  }
  g <- igraph::graph.adjacency(adj, mode="undirected")
  g <- igraph::simplify(g)
  km <- method(g)
  if(verbose) {
    mod <- igraph::modularity(km)
    if(mod < 0.3) { print('WARNING') }
    print(paste0('graph modularity: ', mod))
  }

  ## community membership
  com <- km$membership
  names(com) <- km$names
  com <- factor(com)

  if(verbose) {
    print("identifying cluster membership ...")
    print(table(com))
  }
  if(details) {
    return(list(com=com, mod=mod, g=g))
  }
  else {
    return(com)
  }
}


#' Get clusters by community detection on approximate nearest neighbors with subsampling
#'
#' Group cells into clusters based on graph-based community detection on approximate nearest neighbors for random subset of cells
#' For when getComMembership takes too long due to there being too many cells
#'
#' @param mat Matrix of cells as columns. Features as rows (such as PCs).
#' @param k K-nearest neighbor parameter.
#' @param nsubsample Number of cells in subset (default: ncol(mat)*0.5)
#' @param seed Random seed for reproducibility
#' @param vote Use neighbor voting system to annotate rest of cells not in subset. If false, will use machine-learning model. (default: FALSE)
#' @param method Community detection method from igraph. (default: igraph::cluster_walktrap)
#' @param verbose Verbosity (default: TRUE)
#'
#' @return Vector of community annotations
#'
#' @examples \dontrun{
#' data(pbmcA)
#' cd <- pbmcA
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' pcs <- getPcs(mat)
#' com <- getApproxComMembership(pcs, k=30, getApproxComMembership=1000)
#' }
#'
#' @export
getApproxComMembership <- function (mat, k, nsubsample = nrow(mat) * 0.5, method = igraph::cluster_walktrap,
                                    seed = 0, vote = FALSE, verbose = TRUE)
{
  if (verbose) {
    print(paste0("Subsampling from ", nrow(mat), " cells to ",
                 nsubsample, " ... "))
  }
  set.seed(seed)
  subsample <- sample(rownames(mat), nsubsample)
  if (verbose) {
    print("Identifying cluster membership for subsample ... ")
  }
  pcs.sub <- mat[subsample, ]
  com.sub <- getComMembership(pcs.sub, k = k, method = method)
  if (verbose) {
    print("Imputing cluster membership for rest of cells ... ")
  }
  if (vote) {
    data <- mat[subsample, ]
    query <- mat[setdiff(rownames(mat), subsample), ]
    knn <- RANN::nn2(data, query, k = k)[[1]]
    rownames(knn) <- rownames(query)
    com.nonsub <- unlist(apply(knn, 1, function(x) {
      nn <- rownames(data)[x]
      nn.com <- com.sub[nn]
      return(names(sort(table(nn.com), decreasing = TRUE)[1]))
    }))
    com.all <- factor(c(com.sub, com.nonsub)[rownames(mat)])
  }
  else {
    df.sub <- data.frame(celltype = com.sub, pcs.sub)
    model <- MASS::lda(celltype ~ ., data = df.sub)
    df.all <- data.frame(mat)
    model.output <- stats::predict(model, df.all)
    com.all <- model.output$class
    names(com.all) <- rownames(df.all)
    if (verbose) {
      print("Model accuracy for subsample ...")
      print(table(com.all[names(com.sub)] == com.sub))
    }
  }
  return(com.all)
}

#' Linear discriminant analysis model
#'
#' Identifies components that maximally discriminate among groups using a linear discriminant analysis model
#'
#' @param mat Expression matrix with cells as columns, transferable features such as genes as rows.
#' @param com Community annotations
#' @param verbose Verbosity (default: TRUE)
#' @param nfeatures Number of features (genes) in LDA model (default: all)
#' @param random Wehther those features are random of chosen based on variance (most variable will be chosen by default)
#' @param retest Whether to retest model for accuracy
#'
#' @return LDA model
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' pcs <- getPcs(mat)
#' com <- getComMembership(pcs, k=30)
#' model <- modelLda(mat, com)
#' }
#'
#' @export
#'
modelLda <- function(mat, com, nfeatures=nrow(mat), random=FALSE, verbose=TRUE, retest=TRUE) {
  ## filter to reduce feature modeling space
  if(nfeatures < nrow(mat)) {
    if(random) {
      ## random
      set.seed(0)
      vi <- sample(rownames(mat), nfeatures)
      mat <- mat[vi,]
    } else {
      ## most variable (assumes properly variance normalized matrix...fix)
      vi <- getVariableGenes(mat, nfeatures)
      mat <- mat[vi,]
    }
  }

  df <- data.frame(celltype=com, as.matrix(t(mat)))

  if(verbose) {
    print("calculating LDA ...")
  }
  model <- MASS::lda(celltype ~ ., data=df)

  if(retest) {
    ## predict our data based on model
    model.output <- stats::predict(model, df)
    if(verbose) {
      print("LDA prediction accuracy ...")
      print(table(model.output$class==com))
    }
  }

  return(model)
}


#' Differential expression analysis (adapted from PAGODA2)
#'
#' @param cd A read count matrix. The rows correspond to genes, columns correspond to individual cells
#' @param cols Column/cell group annotations. Will perform one vs. all differential expression analysis.
#' @param verbose Verbosity
#'
#' @export
#'
getDifferentialGenes <- function(cd, cols, verbose=TRUE) {
  cm <- t(cd)

  ## match matrix rownames (cells) and group annotations
  if(!all(rownames(cm) %in% names(cols))) { warning("Cluster vector doesn't specify groups for all of the cells, dropping missing cells from comparison")}
  ## determine a subset of cells that's in the cols and cols[cell]!=NA
  valid.cells <- rownames(cm) %in% names(cols)[!is.na(cols)];
  if(!all(valid.cells)) {
    ## take a subset of the count matrix
    cm <- cm[valid.cells,]
  }
  ## reorder cols
  cols <- as.factor(cols[match(rownames(cm),names(cols))]);
  cols <- as.factor(cols);

  if(verbose) {
    print(paste0("Running differential expression with ",length(levels(cols))," clusters ... "))
  }

  ## modified from pagoda2
  # run wilcoxon test comparing each group with the rest
  # calculate rank per column (per-gene) average rank matrix
  xr <- apply(cm, 2, function(foo) {
    #foo[foo==0] <- NA
    bar <- rank(foo)
    #bar[is.na(foo)] <- 0
    bar[foo==0] <- 0
    bar
  }); rownames(xr) <- rownames(cm)
  ##xr <- sparse_matrix_column_ranks(cm);

  # calculate rank sums per group
  grs <- do.call(rbind, lapply(levels(cols), function(g) Matrix::colSums(xr[cols==g,])))
  rownames(grs) <- levels(cols); colnames(grs) <- colnames(xr)
  ##grs <- colSumByFac(xr,as.integer(cols))[-1,,drop=F]

  # calculate number of non-zero entries per group
  gnzz <- do.call(rbind, lapply(levels(cols), function(g) Matrix::colSums(xr[cols==g,]>0)))
  rownames(gnzz) <- levels(cols); colnames(gnzz) <- colnames(xr)
  #xr@x <- numeric(length(xr@x))+1
  #gnzz <- colSumByFac(xr,as.integer(cols))[-1,,drop=F]

  #group.size <- as.numeric(tapply(cols,cols,length));
  #group.size <- as.numeric(tapply(cols,cols,length))[1:nrow(gnzz)]; group.size[is.na(group.size)]<-0; # trailing empty levels are cut off by colSumByFac
  group.size <- as.numeric(table(cols))

  # add contribution of zero entries to the grs
  gnz <- (group.size-gnzz)

  # rank of a 0 entry for each gene
  #zero.ranks <- (nrow(xr)-diff(xr@p)+1)/2 # number of total zero entries per gene
  zero.ranks <- apply(cm, 2, function(foo) {
    bar <- rank(foo)
    bar[foo==0][1]
  })
  ustat <- t((t(gnz)*zero.ranks)) + grs - group.size*(group.size+1)/2

  # standardize
  n1n2 <- group.size*(nrow(cm)-group.size);
  # usigma <- sqrt(n1n2*(nrow(cm)+1)/12) # without tie correction
  # correcting for 0 ties, of which there are plenty
  #usigma <- sqrt(n1n2*(nrow(cm)+1)/12)
  usigma <- sqrt((nrow(cm) +1 - (gnz^3 - gnz)/(nrow(cm)*(nrow(cm)-1)))*n1n2/12)
  # standardized U value- z score
  x <- t((ustat - n1n2/2)/usigma);

  # correct for multiple hypothesis
  x <- matrix(qnorm(bh.adjust(pnorm(as.numeric(abs(x)), lower.tail = FALSE,
                                    log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE),
              ncol = ncol(x)) * sign(x)
  rownames(x) <- colnames(cm)
  colnames(x) <- levels(cols)[1:ncol(x)]

  # add fold change information
  log.gene.av <- log2(Matrix::colMeans(cm));
  group.gene.av <- do.call(rbind, lapply(levels(cols), function(g) Matrix::colSums(cm[cols==g,]>0))) / (group.size+1);
  log2.fold.change <- log2(t(group.gene.av)) - log.gene.av;
  # fraction of cells expressing
  f.expressing <- t(gnzz / group.size);
  max.group <- max.col(log2.fold.change)

  if(verbose) {
    print("Summarizing results ... ")
  }

  ## summarize
  ds <- lapply(1:ncol(x),function(i) {
    r <- data.frame(Z=x[,i],M=log2.fold.change[,i],highest=max.group==i,fe=f.expressing[,i])
    rownames(r) <- rownames(x)
    r
  })
  names(ds)<-colnames(x)

  return(ds)
}
## annotate clusters; old; needs major speed improvement
getClusterGeneInfo <- function(mat, com, verbose=FALSE, ncores=10) {

  if(class(com)!='factor') {
    com <- factor(com)
  }
  if(!all(colnames(mat) %in% names(com))) { warning("cluster vector doesn't specify groups for all of the cells, dropping missing cells from comparison")}
  ## determine a subset of cells that's in the cols and cols[cell]!=NA
  valid.cells <- colnames(mat) %in% names(com)[!is.na(com)]
  if(!all(valid.cells)) {
    ## take a subset of the count matrix
    mat <- mat[, valid.cells]
  }
  ## reorder cols
  cols <- com
  cols <- as.factor(cols[match(colnames(mat),names(cols))])
  cols <- as.factor(cols)

  ## run simple wilcoxon test comparing each group with the rest
  diffgenes <- lapply(levels(cols), function(g) {
    if(verbose) {
      print(paste0("Testing group ", g, " ... "))
    }

    x <- mat[,cols==g]
    y <- mat[,cols!=g]

    ## aucs and pvalues
    aucs.pvs <- parallel::mclapply(seq_len(nrow(x)), function(i) {
      auc <- markerAuc(x[i,], y[i,])
      pv <- wilcox.test(x[i,], y[i,], alternative="greater")$p.value
      return(list(auc, pv))
    }, mc.cores=ncores)
    aucs <- sapply(aucs.pvs, function(x) x[[1]])
    pvs <- sapply(aucs.pvs, function(x) x[[2]])

    ## assess average fold change
    xm <- rowMeans(x)
    ym <- rowMeans(y)
    fc <- log2(xm/ym)
    ## percent expressing
    pe <- rowSums(x>0)/ncol(x)
    ## z-score
    zs <- abs(qnorm(1 - (pvs/2), lower.tail=FALSE))*sign(fc)

    ## store p value
    df <- data.frame(p.value=pvs, marker.auc=aucs, log2.fold.change=fc, percent.expressing=pe, z.score=zs)
    rownames(df) <- rownames(mat)

    return(df)
  })
  names(diffgenes) <- levels(cols)

  return(diffgenes)
}
## Test how well does a gene discriminates a population
## adapted from seurat package
## too slow; to do
markerAuc <- function(x, y) {
  pred <- ROCR::prediction(
    predictions = c(x, y),
    labels = c(rep(x = 1, length(x)), rep(x = 0, length(y))),
    label.ordering = 0:1
  )
  perf <- ROCR::performance(pred, measure = "auc")
  auc <- perf@y.values[[1]]
  return(auc)
}
## diffGenes is output of getDifferentialGenes
getMarkerGenes <- function(mat, com, diffGenes = NULL, upregulated.only=TRUE, z.threshold=3, auc.threshold=0.5, fe.threshold=0.5, M.threshold=0.1, verbose=TRUE, ncores=1) {

  if(is.null(diffGenes)) {
    ds <- getDifferentialGenes(mat, com)
  } else {
    ds <- diffGenes
  }

  if(upregulated.only) {
    dg <- unique(unlist(lapply(ds, function(x) rownames(x)[x$Z>z.threshold])))
  } else {
    dg <- unique(unlist(lapply(ds, function(x) rownames(x)[abs(x$Z)>z.threshold])))
  }
  dg <- intersect(dg, rownames(mat))
  mat <- mat[dg,]

  ## get info
  df.info <- df.list <- getClusterGeneInfo(mat, com, verbose, ncores)

  ## filter
  genes <- lapply(df.list, function(df) {
    if(upregulated.only) {
      df <- df[df$z.score > z.threshold,]
    } else {
      df <- df[abs(df$z.score) > z.threshold,]
    }
    df <- df[df$marker.auc > auc.threshold,]
    df <- df[df$fe > fe.threshold,]
    df <- df[df$M > M.threshold,]
    return(rownames(df))
  })
  names(genes) <- names(df.list)

  return(list(
    genes=genes,
    info=df.info
  ))
}


#' Iterative merging of clusters along tree until stable
#'
#' @param cd Counts matrix for differential expression analysis. Rows are genes. Columns are cells.
#' @param com Community/group annotations for cells
#' @param matnorm Normalized gene expression matrix for building group relationship tree. Rows are genes. Columns are cells.
#' @param z.threshold Z-score threshold for identifying significantly differentially expressed genes
#' @param hclust.method Hierarchical clustering method used to construct relationship tree
#' @param min.group.size Minimum group size for stable cluster
#' @param min.diff.genes Minimum number of significantly differentially expressed genes that must be identified or else groups will be merged
#' @param max.iter Maximum number of iterations. Will end earlier if convergence reached.
#' @param removeGenes Regex expression matching genes you want to remove such as ribosomal or mitochondrial genes.
#' @param plot Whether to plot intermediate plots (hierarchical clustering dendrograms)
#' @param verbose Verbosity
#'
#' @export
#'
getStableClusters <- function(cd, com, matnorm, z.threshold=3, hclust.method='ward.D', min.group.size=10, min.diff.genes=nrow(cd)*0.005, max.iter=10, removeGenes = "^RP|^MT", verbose=TRUE, plot=FALSE) {

  if(min.group.size>1) {
    com[com %in% levels(com)[table(com)<min.group.size]] <- NA
  }
  com <- as.factor(com[colnames(cd)])

  ## compute differentially expressed genes between two branches of dendrogram
  compare <- function(dend) {
    g1 <- labels(dend[[1]])
    g2 <- labels(dend[[2]])
    incom <- names(com)[com %in% g1]
    outcom <- names(com)[com %in% g2]
    com.sub <- factor(c(
      rep(paste0(g1, collapse="."), length(incom)),
      rep(paste0(g2, collapse="."), length(outcom))))
    names(com.sub) <- c(incom, outcom)
    dg <- getDifferentialGenes(cd[, names(com.sub)], com.sub, verbose=verbose)
    dg.sig <- lapply(dg, function(x) {
      ## get only significantly upregulated
      x <- x[x$Z>z.threshold,]
      x <- na.omit(x)
      return(x)
    })
    if(!is.null(removeGenes)) {
      dg.sig <- lapply(dg.sig, function(dg) {
        dg <- dg[!grepl(removeGenes, rownames(dg)), ] ## exclude ribosomal and mitochondrial
      })
    }
    return(dg.sig)
  }
  ## recursively trace dendrogram
  pv.recur <- function(dend) {
    ## compares leaves of tree (groups)
    if(is.leaf(dend[[1]]) & is.leaf(dend[[2]])) {
      g1 <- paste0(labels(dend[[1]]), collapse=":")
      g2 <- paste0(labels(dend[[2]]), collapse=":")
      if(verbose) {
        print(paste0('Comparing ', g1, ' and ', g2))
      }
      ## if insufficient number of marker genes distinguishing two groups, merge
      dg.sig <- compare(dend)
      if(verbose) {
        print('Differential genes found: ')
        print(sapply(dg.sig, length))
      }
      ## both leaves need markers
      #if(any(sapply(dg.sig, length) < min.diff.genes)) {
      if(length(unlist(dg.sig)) < min.diff.genes) {
        if(verbose) {
          print(paste0('Merging ', g1, ' and ', g2))
        }
        com.fin[com.fin==g1] <<- paste0(g1,'.', g2)
        com.fin[com.fin==g2] <<- paste0(g1,'.', g2)
      }
      pv.sig.all[[g1]] <<- dg.sig
      pv.sig.all[[g2]] <<- dg.sig
    } else {
      ## if only one is leaf, compare to entire other branch
      if(is.leaf(dend[[1]])) {
        g1 <- paste0(labels(dend[[1]]), collapse=".")
        g2 <- paste0(labels(dend[[2]]), collapse=".")
        if(verbose) {
          print(paste0('Comparing ', g1, ' and ', g2))
        }
        dg.sig <- compare(dend)
        if(verbose) {
          print('Differential genes found: ')
          print(sapply(dg.sig, nrow))
        }
        ## if insufficient number of marker genes for leaf, set to NA
        #if(any(sapply(dg.sig, length) < min.diff.genes)) {
        #if(length(unlist(dg.sig)) < min.diff.genes) {
        if(nrow(dg.sig[[labels(dend[[1]])]]) < min.diff.genes) {
          if(verbose) {
            print(paste0('Cannot distinguish ', g1, ' and ', g2))
          }
          com.fin[com.fin==g1] <<- NA
        }
        pv.sig.all[[g1]] <<- dg.sig
      } else {
        pv.recur(dend[[1]])
      }
      if(is.leaf(dend[[2]])) {
        g1 <- paste0(labels(dend[[1]]), collapse=".")
        g2 <- paste0(labels(dend[[2]]), collapse=".")
        if(verbose) {
          print(paste0('Comparing ', g1, ' and ', g2))
        }
        dg.sig <- compare(dend)
        if(verbose) {
          print('Differential genes found: ')
          print(sapply(dg.sig, nrow))
        }
        #if(any(sapply(dg.sig, length) < min.diff.genes)) {
        #if(length(unlist(dg.sig)) < min.diff.genes) {
        if(nrow(dg.sig[[labels(dend[[2]])]]) < min.diff.genes) {
          if(verbose) {
            print(paste0('Cannot distinguish ', g1, ' and ', g2))
          }
          com.fin[com.fin==g2] <<- NA
        }
        pv.sig.all[[g2]] <<- dg.sig
      } else {
        pv.recur(dend[[2]])
      }
    }
  }

  i <- 1 ## counter
  repeat {
    com <- factor(com)
    ## average within groups
    mat.summary <- do.call(cbind, lapply(levels(com), function(ct) {
      cells <- which(com==ct)
      if(length(cells) > 1) {
        Matrix::rowMeans(matnorm[, cells])
      } else {
        matnorm[,cells]
      }
    }))
    colnames(mat.summary) <- levels(com)
    ## cluster groups
    hc <- hclust(dist(t(mat.summary)), method=hclust.method)
    if(plot) { plot(hc) }
    dend <- as.dendrogram(hc)
    com.fin <- as.character(com)
    names(com.fin) <- names(com)
    pv.sig.all <- list()
    pv.recur(dend)
    ## test if converged
    if(sum(com==com.fin, na.rm=TRUE)>=min(sum(!is.na(com)), sum(!is.na(com.fin)))) {
      ## compute one last time
      com.fin <- factor(com.fin)
      mat.summary <- do.call(cbind, lapply(levels(com.fin), function(ct) {
        cells <- which(com.fin==ct)
        if(length(cells) > 1) {
          Matrix::rowMeans(matnorm[, cells])
        } else {
          matnorm[,cells]
        }
      }))
      colnames(mat.summary) <- levels(com.fin)
      hc <- hclust(dist(t(mat.summary)), method=hclust.method)
      if(plot) { plot(hc) }
      dend <- as.dendrogram(hc)
      ## exit
      break
    }
    if(i>max.iter) {
      break
    }
    com <- com.fin
    i <- i+1
  }

  return(list(com=com.fin, pv.sig=pv.sig.all, hc=hc, mat.summary=mat.summary))

}


#' Predict LD embedding for new dataset given old model and gene scale factor
#'
#' @param mat Library-size normalized gene expression matrix
#' @param model LDA model
#' @param gsf Gene scale factor to be applied to mat (so mat must not be variance normalized)
#' @param verbose Verbosity
#'
#' @export
#'
predictLds <- function(mat, model, gsf, verbose=TRUE) {
  gsf.have <- intersect(names(gsf), rownames(mat))
  if(verbose) {
    print(paste0('Percentage of features retained: ', length(gsf.have)/length(gsf)))
  }

  ## fill in with zeros
  mat.test <- matrix(0, length(gsf), ncol(mat))
  rownames(mat.test) <- names(gsf)
  colnames(mat.test) <- colnames(mat)
  mat.test[gsf.have,] <- as.matrix(mat[gsf.have,])

  pred <- stats::predict(model, data.frame(t(log10(mat.test*gsf+1))))
}


#' Use LDA model posteriors to retain only confident predictions
#'
#' @param posterior Posterior probabilities from LDA
#' @param t Posteriors below this threshold are set to NA
#'
#' @export
#'
getConfidentPreds <- function(posterior, t=0.95) {
  class <- apply(posterior, 1, function(x) {
    x <- sort(x, decreasing=TRUE)
    if(x[1]-sum(x[-1]) >= t) {
      return(names(x)[1])
    } else {
      return(NA)
    }
  })
  class <- factor(class)
  return(class)
}


#' Merge a list of group annotations
#'
#' @param pred.com List of group annotations
#' @param min.group.size Minumum group size
#' @param t Percentage increase in entropy that must be achieved to merge annotations
#' @param verbose Verbosity
#'
#' @export
#'
mergePredsList <- function(pred.com, min.group.size=30, t=0.1, verbose=TRUE) {

  ## need to be named to sort
  if(is.null(names(pred.com))) {
    names(pred.com) <- seq_len(length(pred.com))
  }

  ## order from greatest number of distinct groups to least
  order <- sort(sapply(pred.com, function(x) length(table(x))), decreasing=TRUE)
  pred.com <- pred.com[names(order)]

  ## merge only if it increases entropy (adds more information)
  preds.com.init <- pred.com[[1]]
  for(i in 2:length(pred.com)) {
    com.test1 <- preds.com.init
    entropy1 <- entropy::entropy(table(com.test1))

    com.test2 <- pred.com[[i]]
    bar <- cbind(as.character(com.test1), as.character(com.test2[names(com.test1)]))
    rownames(bar) <- names(com.test1)
    bar <- na.omit(bar)
    foo <- apply(bar, 1, paste, collapse=".")
    names(foo) <- rownames(bar)
    foo[foo %in% names(which(table(foo)<min.group.size))] <- NA
    entropy2 <- entropy::entropy(table(foo))

    if(verbose) {
      print("Former entropy:")
      print(entropy1)
      print("New entropy:")
      print(entropy2)
    }
    if((entropy2 - entropy1) > entropy1*t) {
      preds.com.init <<- foo
      if(verbose) {
        print("Merge")
        print(table(foo))
      }
    }
  }
  com.all <- factor(preds.com.init)

  return(com.all)
}


#' Batch correct within identified groups using ComBat
#'
#' @param lds.all Matrix to be batch corrected
#' @param batch Batch factor annotations
#' @param com.final Group annotations
#' @param min.group.size Minimum number of cells in a group in order to batch correct
#' @export
#' @importFrom sva ComBat
clusterBasedBatchCorrect <- function(lds.all, batch, com.final, min.group.size=10) {
  com.final <- factor(com.final)
  nas <- names(which(is.na(com.final)))
  lds.bc <- do.call(rbind, lapply(levels(com.final), function(ct){
    cells <- na.omit(names(com.final)[com.final==ct])
    batch.cells <- factor(batch[cells])
    ## correct only if more than N cells per group
    if(sum(table(batch.cells)>min.group.size)>1) {
      t(sva::ComBat(t(lds.all[cells,]), batch.cells))
    } else {
      lds.all[cells,]
    }
  }))
  lds.bc <- rbind(lds.bc, lds.all[nas,])
  return(lds.bc)
}


#' Run tSNE on LDs from model
#'
#' @param mat Normalized matrix
#' @param model LDA model
#' @param perplexity Perplexity parameter for tSNE
#' @param verbose Verbosity
#' @param plot Whether to plot
#' @param do.par Whether to set plot margins
#' @param ncores Number of cores for paralele tSNE
#' @param details Whether to return details
#' @param ... Additional parameters to pass to plot
#'
#' @export
#'
tsneLda <- function(mat, model, perplexity=30, verbose=TRUE, plot=TRUE, do.par=TRUE, ncores=10, details=FALSE, ...) {
  if(verbose) {
    print('Running LDA models ...')
  }

  ## compute LDs
  matm <- t(as.matrix(mat))
  if(class(model)=='lda') {
    genes.need <- rownames(model$scaling)
    mat.temp <- matrix(0, nrow(matm), length(genes.need))
    rownames(mat.temp) <- rownames(matm)
    colnames(mat.temp) <- genes.need
    genes.have <- intersect(genes.need, colnames(matm))
    mat.temp[rownames(matm), genes.have] <- matm[, genes.have]
    stats::predict(model, data.frame(mat.temp))$x
  }
  if(class(model)=='list') {
    reduction <- do.call(cbind, lapply(model, function(m) {
      genes.need <- rownames(m$scaling)
      mat.temp <- matrix(0, nrow(matm), length(genes.need))
      rownames(mat.temp) <- rownames(matm)
      colnames(mat.temp) <- genes.need
      genes.have <- intersect(genes.need, colnames(matm))
      mat.temp[rownames(matm), genes.have] <- matm[, genes.have]
      stats::predict(m, data.frame(mat.temp))$x
    }))
  }

  if(verbose) {
    print(paste0("Running Rtsne with perplexity ", perplexity))
  }

  ## tSNE
  emb <- Rtsne::Rtsne(reduction, is_distance=FALSE, perplexity=perplexity, verbose=verbose, num_threads=ncores)$Y
  rownames(emb) <- colnames(mat)

  if(plot) {
    if(do.par) {
      par(mar = c(0.5,0.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
    }
    plotEmbedding(emb, ...)
  }

  if(details) {
    return(list(
      emb=emb,
      reduction=reduction
    ))
  } else {
    return(emb)
  }
}


#' Plot 2D embedding
#'
#' @param emb dataframe with x and y coordinates
#' @param groups factor annotations for rows on emb for visualizing cluster annotations
#' @param colors color or numeric values for rows on emb for visualizing gene expression
#' @param cex point size
#' @param alpha point opacity
#' @param gradientPalette palette for colors if numeric values provided
#' @param zlim range for colors
#' @param s saturation of rainbow for group colors
#' @param v value of rainbow for group colors
#' @param min.group.size minimum size of group in order for group to be colored
#' @param show.legend whether to show legend
#' @param mark.clusters whether to mark clusters with name of cluster
#' @param mark.cluster.cex cluster marker point size
#' @param shuffle.colors whether to shuffle group colors
#' @param legend.x legend position ie. 'topright', 'topleft', 'bottomleft', 'bottomright'
#' @param gradient.range.quantile quantile for mapping colors to gradient palette
#' @param verbose verbosity
#' @param unclassified.cell.color cells not included in groups will be labeled in this color
#' @param group.level.colors set group level colors. Default uses rainbow.
#' @param ... Additional parameters to pass to BASE::plot
#'
#' @export
plotEmbedding <- function(emb, groups=NULL, colors=NULL, cex=0.6, alpha=0.4, gradientPalette=NULL, zlim=NULL, s=1, v=0.8, min.group.size=1, show.legend=FALSE, mark.clusters=FALSE, mark.cluster.cex=2, shuffle.colors=F, legend.x='topright', gradient.range.quantile=0.95, verbose=TRUE, unclassified.cell.color='gray70', group.level.colors=NULL, ...) {

  if(!is.null(colors)) {
    ## use clusters information
    if(!all(rownames(emb) %in% names(colors))) { warning("provided cluster vector doesn't list colors for all of the cells; unmatched cells will be shown in gray. ")}
    if(all(areColors(colors))) {
      if(verbose) cat("using supplied colors as is\n")
      cols <- colors[match(rownames(emb),names(colors))]; cols[is.na(cols)] <- unclassified.cell.color;
      names(cols) <- rownames(emb)
    } else {
      if(is.numeric(colors)) { # treat as a gradient
        if(verbose) cat("treating colors as a gradient")
        if(is.null(gradientPalette)) { # set up default gradients
          if(all(sign(colors)>=0)) {
            gradientPalette <- colorRampPalette(c('gray80','red'), space = "Lab")(1024)
          } else {
            gradientPalette <- colorRampPalette(c("blue", "grey70", "red"), space = "Lab")(1024)
          }
        }
        if(is.null(zlim)) { # set up value limits
          if(all(sign(colors)>=0)) {
            zlim <- as.numeric(quantile(colors,p=c(1-gradient.range.quantile,gradient.range.quantile)))
            if(diff(zlim)==0) {
              zlim <- as.numeric(range(colors))
            }
          } else {
            zlim <- c(-1,1)*as.numeric(quantile(abs(colors),p=gradient.range.quantile))
            if(diff(zlim)==0) {
              zlim <- c(-1,1)*as.numeric(max(abs(colors)))
            }
          }
        }
        # restrict the values
        colors[colors<zlim[1]] <- zlim[1]; colors[colors>zlim[2]] <- zlim[2];

        if(verbose) cat(' with zlim:',zlim,'\n')
        colors <- (colors-zlim[1])/(zlim[2]-zlim[1])
        cols <- gradientPalette[colors[match(rownames(emb),names(colors))]*(length(gradientPalette)-1)+1]
        names(cols) <- rownames(emb)
      } else {
        stop("colors argument must be a cell-named vector of either character colors or numeric values to be mapped to a gradient")
      }
    }
  } else {
    if(!is.null(groups)) {
      if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
      groups <- as.factor(groups)[rownames(emb)]
      if(verbose) cat("using provided groups as a factor\n")
      factor.mapping=TRUE;
      ## set up a rainbow color on the factor
      factor.colors <- fac2col(groups,s=s,v=v,shuffle=shuffle.colors,min.group.size=min.group.size,unclassified.cell.color=unclassified.cell.color,level.colors=group.level.colors,return.details=T)
      cols <- factor.colors$colors;
      names(cols) <- rownames(emb)
    } else {
      cols <- rep(unclassified.cell.color, nrow(emb))
      names(cols) <- rownames(emb)
    }
  }

  plot(emb,col=adjustcolor(cols,alpha.f=alpha),cex=cex,pch=19,axes=F, ...); box();
  if(mark.clusters) {
    if(!is.null(groups)) {
      cent.pos <- do.call(rbind,tapply(1:nrow(emb),groups,function(ii) apply(emb[ii,,drop=F],2,median)))
      cent.pos <- na.omit(cent.pos);
      text(cent.pos[,1],cent.pos[,2],labels=rownames(cent.pos),cex=mark.cluster.cex)
    }
  }
  if(show.legend) {
    if(factor.mapping) {
      legend(x=legend.x,pch=rep(19,length(levels(groups))),bty='n',col=factor.colors$palette,legend=names(factor.colors$palette))
    }
  }
}

## a utility function to translate factor into colors
fac2col <- function(x,s=1,v=1,shuffle=FALSE,min.group.size=1,return.details=F,unclassified.cell.color='gray50',level.colors=NULL) {
  x <- as.factor(x);
  if(min.group.size>1) {
    x <- factor(x,exclude=levels(x)[unlist(tapply(rep(1,length(x)),x,length))<min.group.size])
    x <- droplevels(x)
  }
  if(is.null(level.colors)) {
    col <- rainbow(length(levels(x)),s=s,v=v);
  } else {
    col <- level.colors[1:length(levels(x))];
  }
  names(col) <- levels(x);

  if(shuffle) col <- sample(col);

  y <- col[as.integer(x)]; names(y) <- names(x);
  y[is.na(y)] <- unclassified.cell.color;
  if(return.details) {
    return(list(colors=y,palette=col))
  } else {
    return(y);
  }
}

## quick utility to check if given character vector is colors
## thanks, Josh O'Brien: http://stackoverflow.com/questions/13289009/check-if-character-string-is-a-valid-color-representation
areColors <- function(x) {
  is.character(x) &
  sapply(x, function(X) {
    tryCatch(is.matrix(col2rgb(X)), error = function(e) FALSE)
  })
}
JEFworks/MUDAN documentation built on June 19, 2021, 6:46 a.m.