SCDC: Bulk RNA-seq deconvolution by scRNA-seq with multi-reference datasets

Documented in deconv_simple SCDC_basis SCDC_basis_ONE SCDC_prop_ONE_subcl_marker SCDC_qc_ONE

#######################################
####### DECONVOLUTION FUNCTIONS #######
#######################################

############################################
#' Basis Matrix
#' @description Basis matrix construction
#' @name SCDC_basis
#' @param x ExpressionSet object for single cells
#' @param ct.sub a subset of cell types that are selected to construct basis matrix
#' @param ct.varname variable name for 'cell types'
#' @param sample variable name for subject/samples
#' @param ct.cell.size default is NULL, which means the "library size" is calculated based on the data. Users can specify a vector of cell size factors corresponding to the ct.sub according to prior knowledge. The vector should be named: names(ct.cell.size input) should not be NULL.
#' @return a list of basis matrix, sum of cell-type-specific library size, sample variance matrix, basis matrix by mvw, mvw matrix.
#' @export
SCDC_basis <- function(x, ct.sub = NULL, ct.varname, sample, ct.cell.size = NULL){
  # select only the subset of cell types of interest
  if (is.null(ct.sub)){
    ct.sub <- unique(x@phenoData@data[,ct.varname])
  }
  ct.sub <- ct.sub[!is.na(ct.sub)]
  x.sub <- x[,x@phenoData@data[,ct.varname] %in% ct.sub]
  # qc: remove non-zero genes
  x.sub <- x.sub[rowSums(exprs(x.sub)) > 0,]
  # calculate sample mean & sample variance matrix: genes by cell types
  countmat <- exprs(x.sub)
  ct.id <- droplevels(as.factor(x.sub@phenoData@data[,ct.varname]))
  sample.id <- as.character(x.sub@phenoData@data[,sample])
  ct_sample.id <- paste(ct.id,sample.id, sep = '%')

  mean.mat <- sapply(unique(ct_sample.id), function(id){
    y = as.matrix(countmat[, ct_sample.id %in% id])
    apply(y,1,sum, na.rm = TRUE)/sum(y)
  })
  mean.id <- do.call('rbind',strsplit(unique(ct_sample.id), split = '%'))

  sigma <- sapply(unique(mean.id[, 1]), function(id) {
    y = mean.mat[, mean.id[, 1] %in% id]
    if (is.null(dim(y))){
      res = rep(0, length(y))
      message("Warning: the cell type [", id,"] is only available in at most 1 subject!")
    } else {
      res = apply(y, 1, var, na.rm = TRUE)
    }
    return(res)
  })

  sum.mat2 <- sapply(unique(sample.id), function(sid) {
    sapply(unique(ct.id), function(id) {
      y = as.matrix(countmat[, ct.id %in% id & sample.id %in% sid])
      if (ncol(y)>0){
        out = sum(y)/ncol(y)
      } else {
        out = 0
      }
      return(out)
    })
  })
  rownames(sum.mat2) <- unique(ct.id)
  colnames(sum.mat2) <- unique(sample.id)
  # library size factor calculated from the samples:
  if (is.null(ct.cell.size)){
    sum.mat <- rowMeans(sum.mat2, na.rm = T)
  } else {
    if (is.null(names(ct.cell.size))){
      message("Cell size factor vector requires cell type names...")
      break
    } else {
      sum.mat <- ct.cell.size
    }
  }

  basis <- sapply(unique(mean.id[,1]), function(id){
    z <- sum.mat[mean.id[,1]]
    mean.mat.z <- t(t(mean.mat)*z)
    y = as.matrix(mean.mat.z[,mean.id[,1] %in% id])
    apply(y,1,mean, na.rm = TRUE)
  })

  # weighted basis matrix
  my.max <- function(x, ...) {
    y <- apply(x, 1, max, na.rm = TRUE)
    if (median(y, na.rm = T) == 0){
      outx = y
    }else{
      outx = y/median(y, na.rm = T)
    }
    return(outx)
  }

  # MATCH DONOR, CELLTYPE, GENES!!!!!!!!!!!!!!!!
  var.adj <- sapply(unique(sample.id), function(sid) {
    my.max(sapply(unique(ct.id), function(id) {
      y = countmat[, ct.id %in% id & sample.id %in% sid,
                   drop = FALSE]
      if (ncol(y)>0){
        out = apply(y, 1, var, na.rm = T)
      } else {
        out = rep(0, nrow(y))
      }
      return(out)
    }), na.rm = T)
  })
  colnames(var.adj) <- unique(sample.id)

  # q15 <- apply(var.adj,2,quantile, probs = 0.15, na.rm =T)
  q15 <- apply(var.adj, 2, function(zz) {
    z1 = min(zz[zz > 0])
    z2 = quantile(zz, 0.15, na.rm = T)
    return(max(z1, z2))
  })
  q85 <- apply(var.adj, 2, quantile, probs = 0.85, na.rm = T)
  var.adj.q <- t(apply(var.adj, 1, function(y) {
    y[y < q15] <- q15[y < q15]
    y[y > q85] <- q85[y > q85]
    return(y)
  }))

  var.adj.q <- t(apply(var.adj, 1, function(y){
                        y[y<q15] <- q15[y<q15]
                         y[y>q85] <- q85[y>q85]
                         return(y)}
                      )
                 ) #+ 1e-4

  message("Creating Basis Matrix adjusted for maximal variance weight")
  mean.mat.mvw <- sapply(unique(ct_sample.id), function(id){
    sid = unlist(strsplit(id,'%'))[2]
    y = as.matrix(countmat[, ct_sample.id %in% id])
    yy = sweep(y, 1, sqrt(var.adj.q[,sid]), '/')
    apply(yy,1,sum, na.rm = TRUE)/sum(yy)
  })

  basis.mvw <- sapply(unique(mean.id[,1]), function(id){
    z <- sum.mat[mean.id[,1]]
    mean.mat.z <- t(t(mean.mat.mvw)*z)
    y = as.matrix(mean.mat.z[,mean.id[,1] %in% id])
    apply(y,1,mean, na.rm = TRUE)
  })

  # reorder columns
  basis.mvw <- basis.mvw[,ct.sub]
  sigma <- sigma[, ct.sub]
  basis <- basis[, ct.sub]
  sum.mat <- sum.mat[ct.sub]

  return(list(basis = basis, sum.mat = sum.mat,
              sigma = sigma, basis.mvw = basis.mvw, var.adj.q = var.adj.q))
}

#############################################
#' Basis matrix for single cells from one subject
#' @description Basis matrix construction for single cells from one subject
#' @name SCDC_basis_ONE
#' @param x ExpressionSet object for single cells
#' @param ct.sub a subset of cell types that are selected to construct basis matrix
#' @param ct.varname variable name for 'cell types'
#' @param sample variable name for subject/samples
#' @param ct.cell.size default is NULL, which means the "library size" is calculated based on the data. Users can specify a vector of cell size factors corresponding to the ct.sub according to prior knowledge. The vector should be named: names(ct.cell.size input) should not be NULL.
#' @return a list of basis matrix, sum of cell-type-specific library size, sample variance matrix, basis matrix by mvw, mvw matrix.
#' @export
SCDC_basis_ONE <- function(x , ct.sub = NULL, ct.varname, sample, ct.cell.size = NULL){
  # select only the subset of cell types of interest
  if (is.null(ct.sub)){
    ct.sub <- unique(x@phenoData@data[,ct.varname])[!is.na(unique(x@phenoData@data[,ct.varname]))]
  }
  ct.sub <- ct.sub[!is.na(ct.sub)]
  x.sub <- x[,x@phenoData@data[,ct.varname] %in% ct.sub]
  # qc: remove non-zero genes
  x.sub <- x.sub[rowSums(exprs(x.sub)) > 0,]
  # calculate sample mean & sample variance matrix: genes by cell types
  countmat <- exprs(x.sub)
  # ct.id <- droplevels(as.factor(x.sub@phenoData@data[,ct.varname]))
  ct.id <- x.sub@phenoData@data[,ct.varname]
  sample.id <- x.sub@phenoData@data[,sample]
  ct_sample.id <- paste(ct.id,sample.id, sep = '%')

  mean.mat <- sapply(unique(ct_sample.id), function(id){
    y = as.matrix(countmat[, ct_sample.id %in% id])
    apply(y,1,sum, na.rm = TRUE)/sum(y)
  })
  mean.id <- do.call('rbind',strsplit(unique(ct_sample.id), split = '%'))

  # by subj, then take avg????
  sum.mat2 <- sapply(unique(sample.id), function(sid) {
    sapply(unique(ct.id), function(id) {
      y = as.matrix(countmat[, ct.id %in% id & sample.id %in%
                               sid])
      if (ncol(y)>0){
        out = sum(y)/ncol(y)
      } else {
        out = 0
      }
      return(out)
    })
  })
  rownames(sum.mat2) <- unique(ct.id)
  colnames(sum.mat2) <- unique(sample.id)
  # sum.mat <- rowMeans(sum.mat2, na.rm = T)
  if (is.null(ct.cell.size)){
    sum.mat <- rowMeans(sum.mat2, na.rm = T)
  } else {
    if (is.null(names(ct.cell.size))){
      message("Cell size factor vector requires cell type names...")
      break
    } else {
      sum.mat <- ct.cell.size
    }
  }

  basis <- sapply(unique(mean.id[,1]), function(id){
    z <- sum.mat[mean.id[,1]]
    mean.mat.z <- t(t(mean.mat)*z)
    # id = unique(mean.id[,1])[1]
    y = as.matrix(mean.mat.z[,mean.id[,1] %in% id])
    apply(y,1,mean, na.rm = TRUE)
  })

  # weighted basis matrix
  my.max <- function(x, ...) {
    y <- apply(x, 1, max, na.rm = TRUE)
    if (median(y, na.rm = T) == 0){
      outx = y
    }else{
      outx = y/median(y, na.rm = T)
    }
    return(outx)
  }

  var.adj <- sapply(unique(sample.id), function(sid) {
    my.max(sapply(unique(ct.id), function(id) {
      y = countmat[, ct.id %in% id & sample.id %in% sid,
                   drop = FALSE]
      if (ncol(y)>0){
        out = apply(y, 1, var, na.rm = T)
      } else {
        out = rep(0, nrow(y))
      }
      return(out)
    }), na.rm = T)
  })
  colnames(var.adj) <- unique(sample.id)

  # q15 <- apply(var.adj,2,quantile, probs = 0.15, na.rm =T)
  q15 <- apply(var.adj, 2, function(zz){
    z1 = min(zz[zz>0])
    z2 = quantile(zz, 0.15, na.rm = T)
    return(max(z1,z2))
  })
  q85 <- apply(var.adj,2,quantile, probs = 0.85, na.rm =T)

  var.adj.q <- as.matrix(apply(var.adj, 1,
                               function(y){y[y<q15] <- q15[y<q15]
                               y[y>q85] <- q85[y>q85]
                               return(y)}) ) #+ 1e-4

  message("Creating Basis Matrix adjusted for maximal variance weight")
  mean.mat.mvw <- sapply(unique(ct_sample.id), function(id){
    y = as.matrix(countmat[, ct_sample.id %in% id])
    yy = sweep(y, 1, sqrt(var.adj.q), '/')
    apply(yy,1,sum, na.rm = TRUE)/sum(yy)
  })

  basis.mvw <- sapply(unique(mean.id[,1]), function(id){
    z <- sum.mat[mean.id[,1]]
    mean.mat.z <- t(t(mean.mat.mvw)*z)
    y = as.matrix(mean.mat.z[,mean.id[,1] %in% id])
    apply(y,1,mean, na.rm = TRUE)
  })

  # reorder columns
  basis.mvw <- basis.mvw[,ct.sub]
  sigma <- NULL # in the one subject case, no variance is calculated.
  basis <- basis[, ct.sub]
  sum.mat <- sum.mat[ct.sub]

  return(list(basis = basis, sum.mat = sum.mat,
              sigma = sigma, basis.mvw = basis.mvw, var.adj.q = var.adj.q))
}

#################################
#' Clustering QC
#' @description Single cells Clustering QC
#' @name SCDC_qc
#' @import pheatmap
#' @param sc.eset ExpressionSet object for single cells
#' @param ct.varname variable name for 'cell type'
#' @param sample variable name for subject/sample
#' @param scsetname the name for the single cell dataset
#' @param ct.sub a subset of cell types that are selected to construct basis matrix
#' @param iter.max the maximum number of iteration in WNNLS
#' @param nu a small constant to facilitate the calculation of variance
#' @param epsilon a small constant number used for convergence criteria
#' @param arow annotation of rows for pheatmap. Should be a variable name, like "sample" or "Subject".
#' @param qcthreshold the probability threshold used to filter out questionable cells
#' @param generate.figure logical. If generate the heatmap by pheatmap or not. default is TRUE.
#' @param ct.cell.size default is NULL, which means the "library size" is calculated based on the data. Users can specify a vector of cell size factors corresponding to the ct.sub according to prior knowledge. The vector should be named: names(ct.cell.size input) should not be NULL.
#' @return a list including: 1) a probability matrix for each single cell input; 2) a clustering QCed ExpressionSet object; 3) a heatmap of QC result.
#' @export
SCDC_qc <- function (sc.eset, ct.varname, sample, scsetname = "Single Cell",
                   ct.sub, iter.max = 1000, nu = 1e-04, epsilon = 0.001, arow =NULL,
                   qcthreshold = 0.7, generate.figure = T, ct.cell.size = NULL,
                   cbPalette = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"),
                   ...) {
  sc.basis = SCDC_basis(x = sc.eset, ct.sub = ct.sub, ct.varname = ct.varname, sample = sample, ct.cell.size = ct.cell.size)
  M.S <- sc.basis$sum.mat[ct.sub]
  xsc <- getCPM0(exprs(sc.eset)[rownames(sc.basis$basis.mvw),])
  N.sc <- ncol(xsc)

  m.basis <- sc.basis$basis.mvw[, ct.sub]
  sigma <- sc.basis$sigma[, ct.sub]
  valid.ct <- (colSums(is.na(sigma)) == 0) & (colSums(is.na(m.basis)) ==
                                                0) & (!is.na(M.S))
  if (sum(valid.ct) <= 1) {
    stop("Not enough valid cell type!")
  }
  message(paste("Used", sum(valid.ct), "cell types in deconvolution..."))
  m.basis <- m.basis[, valid.ct]
  M.S <- M.S[valid.ct]
  sigma <- sigma[, valid.ct]

  prop.qc <- NULL
  for (i in 1:N.sc) {
    message("Begin iterative weighted estimation...")
    basis.temp <- m.basis
    xsc.temp <- xsc[, i]
    sigma.temp <- sigma
    ### weighting scheme:
    lm.qc <- nnls::nnls(A=basis.temp,b=xsc.temp)
    delta <- lm.qc$residuals
    wt.gene <- 1/(nu + delta^2 + colSums((lm.qc$x)^2*t(sigma.temp)))
    x.wt <- xsc.temp*sqrt(wt.gene)
    b.wt <- sweep(basis.temp,1,sqrt(wt.gene),"*")
    lm.wt <- nnls::nnls(A=b.wt, b=x.wt)
    prop.wt <- lm.wt$x/sum(lm.wt$x)
    delta <- lm.wt$residuals
    for (iter in 1:iter.max){
      wt.gene <- 1/(nu + delta^2 + colSums((lm.wt$x)^2*t(sigma.temp)))
      x.wt <- xsc.temp*sqrt(wt.gene)
      b.wt <- sweep(basis.temp,1,sqrt(wt.gene),"*")
      lm.wt <- nnls::nnls(A=b.wt, b=x.wt)
      delta.new <- lm.wt$residuals
      prop.wt.new <- lm.wt$x/sum(lm.wt$x)
      if (sum(abs(prop.wt - prop.wt.new) < epsilon )){
        prop.wt <- prop.wt.new
        delta <- delta.new
        message("Converged at iteration ", iter)
        break
      }
      prop.wt <- prop.wt.new
      delta <- delta.new
    }

    prop.qc <- rbind(prop.qc, prop.wt)
  }
  # name col and row
  colnames(prop.qc) <- colnames(m.basis)
  rownames(prop.qc) <- colnames(xsc)
  if (!is.null(arow)){
    df.arow <- data.frame(sc.eset@phenoData@data[,arow])
    rownames(df.arow) <- rownames(sc.eset@phenoData@data)
    colnames(df.arow) <- arow
  } else {
    df.arow <- NULL
  }
  if (generate.figure) {
    heat.anno <- pheatmap::pheatmap(prop.qc, annotation_row = df.arow,
                                    annotation_names_row = FALSE, show_rownames = F,
                                    annotation_names_col = FALSE, cutree_rows = length(ct.sub),
                                    color = cbPalette[2:4], cluster_rows = T, cluster_cols = F)
  } else {
    heat.anno <- NULL
  }

  prop.qc.keep <- rowSums(prop.qc > qcthreshold) ==1 # truncated values -> F or T
  sc.eset.qc <- sc.eset[,prop.qc.keep]
  return(list(prop.qc = prop.qc, sc.eset.qc = sc.eset.qc, heatfig = heat.anno))
}

#################################
#' Clustering QC for single cells from one subject
#' @description Clustering QC for single cells from one subject
#' @name SCDC_qc_ONE
#' @param sc.eset ExpressionSet object for single cells
#' @param ct.varname variable name for 'cell type'
#' @param sample variable name for subject/sample
#' @param scsetname the name for the single cell dataset
#' @param ct.sub a subset of cell types that are selected to construct basis matrix
#' @param iter.max the maximum number of iteration in WNNLS
#' @param nu a small constant to facilitate the calculation of variance
#' @param epsilon a small constant number used for convergence criteria
#' @param arow annotation of rows for pheatmap
#' @param qcthreshold the probability threshold used to filter out questionable cells
#' @param generate.figure logical. If generate the heatmap by pheatmap or not. default is TRUE.
#' @param ct.cell.size default is NULL, which means the "library size" is calculated based on the data. Users can specify a vector of cell size factors corresponding to the ct.sub according to prior knowledge. The vector should be named: names(ct.cell.size input) should not be NULL.
#' @return a list including: 1) a probability matrix for each single cell input; 2) a clustering QCed ExpressionSet object; 3) a heatmap of QC result.
#' @export
SCDC_qc_ONE <- function(sc.eset, ct.varname, sample, scsetname = "Single Cell",
                  ct.sub, iter.max = 1000, nu = 1e-04, epsilon = 0.001,
                    arow = NULL, weight.basis = F, qcthreshold = 0.7,
                  generate.figure = T, ct.cell.size = NULL,
                  cbPalette = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"),
                  ...){
  sc.basis <- SCDC_basis_ONE(x = sc.eset, ct.sub = ct.sub, ct.varname = ct.varname, sample = sample, ct.cell.size = ct.cell.size)
  if (weight.basis){
    basis.mvw <- sc.basis$basis.mvw[, ct.sub]
  } else {
    basis.mvw <- sc.basis$basis[, ct.sub]
  }

  xsc <- getCPM0(exprs(sc.eset))
  N.sc <- ncol(xsc)

  ALS.S <- sc.basis$sum.mat[ct.sub]
  valid.ct <- (colSums(is.na(basis.mvw)) == 0) & (!is.na(ALS.S))
  if (sum(valid.ct) <= 1) {
    stop("Not enough valid cell type!")
  }
  message(paste("Used", sum(valid.ct), "cell types in deconvolution..."))
  basis.mvw <- basis.mvw[, valid.ct]
  ALS.S <- ALS.S[valid.ct]
  prop.est.mvw <- NULL

  # prop estimation for each sc sample:
  for (i in 1:N.sc) {
    xsc.i <- xsc[, i]*100 # why times 100 if not normalize???
    gene.use <- intersect(rownames(basis.mvw), names(xsc.i))
    basis.mvw.temp <- basis.mvw[gene.use,]
    xsc.temp <- xsc.i[gene.use]
    message(paste(colnames(xsc)[i], "has common genes", sum(xsc[, i] != 0), "..."))
    # first NNLS:

    lm <- nnls::nnls(A=basis.mvw.temp,b=xsc.temp)
    delta <- lm$residuals
    wt.gene <- 1/(nu + delta^2)
    x.wt <- xsc.temp*sqrt(wt.gene)
    b.wt <- sweep(basis.mvw.temp,1,sqrt(wt.gene),"*")

    lm.wt <- nnls::nnls(A=b.wt, b=x.wt)
    prop.wt <- lm.wt$x/sum(lm.wt$x)
    delta <- lm.wt$residuals

    for (iter in 1:iter.max){
      wt.gene <- 1/(nu + delta^2)
      x.wt <- xsc.temp * sqrt(wt.gene)
      b.wt <- sweep(basis.mvw.temp,1,sqrt(wt.gene),"*")
      lm.wt <- nnls::nnls(A=b.wt, b=x.wt)
      delta.new <- lm.wt$residuals
      prop.wt.new <- lm.wt$x/sum(lm.wt$x)

      if (sum(abs(prop.wt.new - prop.wt)) < epsilon){
        prop.wt <- prop.wt.new
        delta <- delta.new
        message("WNNLS Converged at iteration ", iter)
        break
      }
      prop.wt <- prop.wt.new
      delta <- delta.new
    }

    prop.est.mvw <- rbind(prop.est.mvw, prop.wt)
  }
  colnames(prop.est.mvw) <- colnames(basis.mvw)
  rownames(prop.est.mvw) <- colnames(xsc)

  ### plot steps:
 if (generate.figure){
   heat.anno <- pheatmap(prop.est.mvw, annotation_row = arow,
                         annotation_names_row=FALSE, show_rownames = F,
                         annotation_names_col=FALSE, cutree_rows = length(ct.sub),
                         color = cbPalette[2:4],
                         cluster_rows = T, cluster_cols = F) #, main = scsetname
 } else {
   heat.anno <- NULL
 }

  prop.qc.keep <- rowSums(prop.est.mvw > qcthreshold) ==1 # truncated values -> F or T
  sc.eset.qc <- sc.eset[,prop.qc.keep]
  return(list(prop.qc = prop.est.mvw, sc.eset.qc = sc.eset.qc, heatfig = heat.anno))
}

######################################
#' Proportion estimation
#' @description Proportion estimation function for multi-subject case
#' @name SCDC_prop
#' @param bulk.eset ExpressionSet object for bulk samples
#' @param sc.eset ExpressionSet object for single cell samples
#' @param ct.varname variable name for 'cell types'
#' @param sample variable name for subject/samples
#' @param ct.sub a subset of cell types that are selected to construct basis matrix
#' @param iter.max the maximum number of iteration in WNNLS
#' @param nu a small constant to facilitate the calculation of variance
#' @param epsilon a small constant number used for convergence criteria
#' @param truep true cell-type proportions for bulk samples if known
#' @param ct.cell.size default is NULL, which means the "library size" is calculated based on the data. Users can specify a vector of cell size factors corresponding to the ct.sub according to prior knowledge. The vector should be named: names(ct.cell.size input) should not be NULL.
#' @param Transform_bisque The bulk sample transformation from bisqueRNA. Aiming to reduce the systematic difference between single cells and bulk samples.
#' @return Estimated proportion, basis matrix, predicted gene expression levels for bulk samples
#' @export
SCDC_prop <- function (bulk.eset, sc.eset, ct.varname, sample, ct.sub, iter.max = 1000,
                       nu = 1e-04, epsilon = 0.001, truep = NULL, weight.basis = T,
                       ct.cell.size = NULL, Transform_bisque = F, ...)
{
  bulk.eset <- bulk.eset[rowSums(exprs(bulk.eset)) > 0, , drop = FALSE]
  ct.sub <- intersect(ct.sub, unique(sc.eset@phenoData@data[,
                                                            ct.varname]))
  sc.basis <- SCDC_basis(x = sc.eset, ct.sub = ct.sub, ct.varname = ct.varname,
                         sample = sample, ct.cell.size = ct.cell.size)
  commongenes <- intersect(rownames(sc.basis$basis.mvw), rownames(bulk.eset))
  if (length(commongenes) < 0.2 * min(dim(sc.eset)[1], dim(bulk.eset)[1])) {
    stop("Too few common genes!")
  }
  message(paste("Used", length(commongenes), "common genes..."))
  if (weight.basis) {
    basis.mvw <- sc.basis$basis.mvw[commongenes, ct.sub]
  }
  else {
    basis.mvw <- sc.basis$basis[commongenes, ct.sub]
  }
  # link to bisqueRNA, bulk transformation method. https://github.com/cozygene/bisque
  if (Transform_bisque) {
    GenerateSCReference <- function(sc.eset, ct.sub, ct.varname) {
      cell.labels <- base::factor(sc.eset[[ct.varname]])
      all.cell.types <- base::levels(cell.labels)
      aggr.fn <- function(ct.sub) {
        base::rowMeans(Biobase::exprs(sc.eset)[,cell.labels == ct.sub, drop=F])
      }
      template <- base::numeric(base::nrow(sc.eset))
      sc.ref <- base::vapply(all.cell.types, aggr.fn, template)
      return(sc.ref)
    }
    sc.ref <- GenerateSCReference(sc.eset, ct.sub, ct.varname)[commongenes, , drop = F]
    ncount <- table(sc.eset@phenoData@data[, sample], sc.eset@phenoData@data[, ct.varname])
    true.prop <- ncount/rowSums(ncount, na.rm = T)
    sc.props <- round(true.prop[complete.cases(true.prop), ], 2)
    Y.train <- sc.ref %*% t(sc.props[, colnames(sc.ref)])
    X.pred <- exprs(bulk.eset)[commongenes, ]
    sample.names <- base::colnames(Biobase::exprs(bulk.eset))
    template <- base::numeric(base::length(sample.names))
    base::names(template) <- sample.names
    SemisupervisedTransformBulk <- function(gene, Y.train, X.pred) {
      Y.train.scaled <- base::scale(Y.train[gene, , drop = T])
      Y.center <- base::attr(Y.train.scaled, "scaled:center")
      Y.scale <- base::attr(Y.train.scaled, "scaled:scale")
      n <- base::length(Y.train.scaled)
      shrink.scale <- base::sqrt(base::sum((Y.train[gene, , drop = T] - Y.center)^2)/n + 1)
      X.pred.scaled <- base::scale(X.pred[gene, , drop = T])
      Y.pred <- base::matrix((X.pred.scaled * shrink.scale) +
                               Y.center, dimnames = base::list(base::colnames(X.pred),
                                                               gene))
      return(Y.pred)
    }
    Y.pred <- base::matrix(base::vapply(X = commongenes,
                                        FUN = SemisupervisedTransformBulk, FUN.VALUE = template,
                                        Y.train, X.pred, USE.NAMES = TRUE), nrow = base::length(sample.names))
    indices <- base::apply(Y.pred, MARGIN = 2, FUN = function(column) {
      base::anyNA(column)
    })
    if (base::any(indices)) {
      if (sum(!indices) == 0) {
        base::stop("Zero genes left for decomposition.")
      }
      Y.pred <- Y.pred[, !indices, drop = F]
      sc.ref <- sc.ref[!indices, , drop = F]
    }

    results <- base::as.matrix(base::apply(Y.pred, 1, function(b) {
      sol <- lsei::pnnls(sc.ref, b, sum = 1)
      return(sol$x)
    }))

    prop.est.mvw <- t(results)
    colnames(prop.est.mvw) <- colnames(sc.ref)
    rownames(prop.est.mvw) <- colnames(bulk.eset)
    yhat <- sc.ref %*% results
    colnames(yhat) <- colnames(bulk.eset)
    yobs <- exprs(bulk.eset)
    yeval <- SCDC_yeval(y = yobs, yest = yhat, yest.names = c("SCDC"))
    peval <- NULL
    if (!is.null(truep)) {
      peval <- SCDC_peval(ptrue = truep, pest = prop.est.mvw,
                          pest.names = c("SCDC"), select.ct = ct.sub)
    }
  } else {
    xbulk <- getCPM0(exprs(bulk.eset)[commongenes, , drop = F])
    sigma <- sc.basis$sigma[commongenes, ct.sub]
    ALS.S <- sc.basis$sum.mat[ct.sub]
    N.bulk <- ncol(bulk.eset)
    valid.ct <- (colSums(is.na(sigma)) == 0) & (colSums(is.na(basis.mvw)) ==
                                                  0) & (!is.na(ALS.S))
    if (sum(valid.ct) <= 1) {
      stop("Not enough valid cell type!")
    }
    message(paste("Used", sum(valid.ct), "cell types in deconvolution..."))
    basis.mvw <- basis.mvw[, valid.ct]
    ALS.S <- ALS.S[valid.ct]
    sigma <- sigma[, valid.ct]
    prop.est.mvw <- NULL
    yhat <- NULL
    yhatgene.temp <- rownames(basis.mvw)
    for (i in 1:N.bulk) {
      basis.mvw.temp <- basis.mvw
      xbulk.temp <- xbulk[, i]*100
      sigma.temp <- sigma
      message(paste(colnames(xbulk)[i], "has common genes",
                    sum(xbulk[, i] != 0), "..."))
      lm <- nnls::nnls(A = basis.mvw.temp, b = xbulk.temp)
      delta <- lm$residuals
      wt.gene <- 1/(nu + delta^2 + colSums((lm$x * ALS.S)^2 *
                                             t(sigma.temp)))
      x.wt <- xbulk.temp * sqrt(wt.gene)
      b.wt <- sweep(basis.mvw.temp, 1, sqrt(wt.gene), "*")
      lm.wt <- nnls::nnls(A = b.wt, b = x.wt)
      prop.wt <- lm.wt$x/sum(lm.wt$x)
      delta <- lm.wt$residuals
      for (iter in 1:iter.max) {
        wt.gene <- 1/(nu + delta^2 + colSums((lm.wt$x * ALS.S)^2 *
                                               t(sigma.temp)))
        x.wt <- xbulk.temp * sqrt(wt.gene)
        b.wt <- sweep(basis.mvw.temp, 1, sqrt(wt.gene), "*")
        lm.wt <- nnls::nnls(A = b.wt, b = x.wt)
        delta.new <- lm.wt$residuals
        prop.wt.new <- lm.wt$x/sum(lm.wt$x)
        if (sum(abs(prop.wt.new - prop.wt)) < epsilon) {
          prop.wt <- prop.wt.new
          delta <- delta.new
          R2 <- 1 - var(xbulk.temp - basis.mvw.temp %*%
                          as.matrix(lm.wt$x))/var(xbulk.temp)
          message("WNNLS Converged at iteration ",
                  iter)
          break
        }
        prop.wt <- prop.wt.new
        delta <- delta.new
      }
      R2 <- 1 - var(xbulk.temp - basis.mvw.temp %*% as.matrix(lm.wt$x))/var(xbulk.temp)
      prop.est.mvw <- rbind(prop.est.mvw, prop.wt)
      yhat.temp <- basis.mvw.temp %*% as.matrix(lm.wt$x)
      yhatgene.temp <- intersect(rownames(yhat.temp), yhatgene.temp)
      yhat <- cbind(yhat[yhatgene.temp, ], yhat.temp[yhatgene.temp,
      ])
    }
    colnames(prop.est.mvw) <- colnames(basis.mvw)
    rownames(prop.est.mvw) <- colnames(xbulk)
    colnames(yhat) <- colnames(xbulk)
    yobs <- exprs(bulk.eset)
    yeval <- SCDC_yeval(y = yobs, yest = yhat, yest.names = c("SCDC"))
    peval <- NULL
    if (!is.null(truep)) {
      peval <- SCDC_peval(ptrue = truep, pest = prop.est.mvw,
                          pest.names = c("SCDC"), select.ct = ct.sub)
    }
  }

  return(list(prop.est.mvw = prop.est.mvw, basis.mvw = basis.mvw,
              yhat = yhat, yeval = yeval, peval = peval))
}

############################################
#' Proportion estimation function for one-subject case
#' @description Proportion estimation function for one-subject case
#' @name SCDC_prop_ONE
#' @param bulk.eset ExpressionSet object for bulk samples
#' @param sc.eset ExpressionSet object for single cell samples
#' @param ct.varname variable name for 'cell types'
#' @param sample variable name for subject/samples
#' @param ct.sub a subset of cell types that are selected to construct basis matrix
#' @param iter.max the maximum number of iteration in WNNLS
#' @param nu a small constant to facilitate the calculation of variance
#' @param epsilon a small constant number used for convergence criteria
#' @param truep true cell-type proportions for bulk samples if known
#' @param ct.cell.size default is NULL, which means the "library size" is calculated based on the data. Users can specify a vector of cell size factors corresponding to the ct.sub according to prior knowledge. The vector should be named: names(ct.cell.size input) should not be NULL.
#' @return Estimated proportion, basis matrix, predicted gene expression levels for bulk samples
#' @export
SCDC_prop_ONE <- function (bulk.eset, sc.eset, ct.varname, sample, truep = NULL,
                           ct.sub, iter.max = 2000, nu = 1e-10, epsilon = 0.001, weight.basis = T,
                           ct.cell.size = NULL,
                           ...) {
  bulk.eset <- bulk.eset[rowSums(exprs(bulk.eset)) > 0, , drop = FALSE]
  sc.basis <- SCDC_basis_ONE(x = sc.eset, ct.sub = ct.sub,
                             ct.varname = ct.varname, sample = sample, ct.cell.size = ct.cell.size)
  if (weight.basis) {
    basis <- sc.basis$basis.mvw
  }
  else {
    basis <- sc.basis$basis
  }
  commongenes <- intersect(rownames(basis), rownames(bulk.eset))
  if (length(commongenes) < 0.2 * min(dim(sc.eset)[1], dim(bulk.eset)[1])) {
    stop("Too few common genes!")
  }
  message(paste("Used", length(commongenes), "common genes..."))
  basis.mvw <- basis[commongenes, ct.sub]
  xbulk <- getCPM0(exprs(bulk.eset)[commongenes, ])
  ALS.S <- sc.basis$sum.mat[ct.sub]
  N.bulk <- ncol(bulk.eset)
  valid.ct <- (colSums(is.na(basis.mvw)) == 0) & (!is.na(ALS.S))
  if (sum(valid.ct) <= 1) {
    stop("Not enough valid cell type!")
  }
  message(paste("Used", sum(valid.ct), "cell types in deconvolution..."))
  basis.mvw <- basis.mvw[, valid.ct]
  ALS.S <- ALS.S[valid.ct]
  prop.est.mvw <- NULL
  yhat <- NULL
  yhatgene.temp <- rownames(basis.mvw)
  for (i in 1:N.bulk) {
    xbulk.temp <- xbulk[, i]
    message(paste(colnames(xbulk)[i], "has common genes",
                  sum(xbulk[, i] != 0), "..."))
    lm <- nnls::nnls(A = basis.mvw, b = xbulk.temp)
    delta <- lm$residuals
    wt.gene <- 1/(nu + delta^2)
    x.wt <- xbulk.temp * sqrt(wt.gene)
    b.wt <- sweep(basis.mvw, 1, sqrt(wt.gene), "*")
    lm.wt <- nnls::nnls(A = b.wt, b = x.wt)
    prop.wt <- lm.wt$x/sum(lm.wt$x)
    delta <- lm.wt$residuals
    for (iter in 1:iter.max) {
      wt.gene <- 1/(nu + delta^2)
      x.wt <- xbulk.temp * sqrt(wt.gene)
      b.wt <- sweep(basis.mvw, 1, sqrt(wt.gene), "*")
      lm.wt <- nnls::nnls(A = b.wt, b = x.wt)
      delta.new <- lm.wt$residuals
      prop.wt.new <- lm.wt$x/sum(lm.wt$x)
      if (sum(abs(prop.wt.new - prop.wt)) < epsilon) {
        prop.wt <- prop.wt.new
        delta <- delta.new
        message("WNNLS Converged at iteration ",
                iter)
        break
      }
      prop.wt <- prop.wt.new
      delta <- delta.new
    }
    prop.est.mvw <- rbind(prop.est.mvw, prop.wt)
    yhat.temp <- basis.mvw %*% as.matrix(lm.wt$x)
    yhatgene.temp <- intersect(rownames(yhat.temp), yhatgene.temp)
    yhat <- cbind(yhat[yhatgene.temp, ], yhat.temp[yhatgene.temp,
                                                   ])
  }
  colnames(prop.est.mvw) <- colnames(basis.mvw)
  rownames(prop.est.mvw) <- colnames(bulk.eset)
  colnames(yhat) <- colnames(bulk.eset)
  yobs <- exprs(bulk.eset)
  yeval <- SCDC_yeval(y = yobs, yest = yhat, yest.names = c("SCDC"))
  peval <- NULL
  if (!is.null(truep)) {
    if (all(rownames(truep) == rownames(prop.est.mvw))){
      peval <- SCDC_peval(ptrue = truep, pest = prop.est.mvw,
                          pest.names = c("SCDC"), select.ct = ct.sub)
    } else {
      message("Your input sample names for proportion matrix and bulk.eset do not match! Please make sure sample names match.")
    }

  }
  return(list(prop.est.mvw = prop.est.mvw, basis.mvw = basis.mvw,
              yhat = yhat, yeval = yeval, peval = peval))
}

############################################
#' Tree-guided proportion estimation
#' @description Proportion estimation function for multi-subject case, and apply tree-guided deconvolution
#' @name SCDC_prop_subcl_marker
#' @param bulk.eset ExpressionSet object for bulk samples
#' @param sc.eset ExpressionSet object for single cell samples
#' @param ct.varname variable name for 'cell types'
#' @param fl.varname variable name for first-level 'meta-clusters'
#' @param sample variable name for subject/samples
#' @param ct.sub a subset of cell types that are selected to construct basis matrix
#' @param ct.fl.sub 'cell types' for first-level 'meta-clusters'
#' @param iter.max the maximum number of iteration in WNNLS
#' @param nu a small constant to facilitate the calculation of variance
#' @param epsilon a small constant number used for convergence criteria
#' @param weight.basis logical, use basis matrix adjusted by MVW, default is T.
#' @param select.marker logical, select marker genes to perform deconvolution in tree-guided steps. Default is T.
#' @param markers A set of marker gene that input manully to be used in deconvolution. If NULL, then
#' @param marker.varname variable name of cluster groups when selecting marker genes. If NULL, then use ct.varname.
#' @param allgenes.fl logical, use all genes in the first-level deconvolution
#' @param pseudocount.use a constant number used when selecting marker genes, default is 1.
#' @param LFC.lim a threshold of log fold change when selecting genes as input to perform Wilcoxon's test.
#' @param truep true cell-type proportions for bulk samples if known
#' @param ct.cell.size default is NULL, which means the "library size" is calculated based on the data. Users can specify a vector of cell size factors corresponding to the ct.sub according to prior knowledge. The vector should be named: names(ct.cell.size input) should not be NULL.
#' @param fl.cell.size default is NULL, similar to ct.cell.size. This is for first-level 'meta-clusters'.
#' @return Estimated proportion, basis matrix, predicted gene expression levels for bulk samples
#' @export
SCDC_prop_subcl_marker <- function (bulk.eset, sc.eset, ct.varname, fl.varname, sample,
                                    ct.sub = NULL, ct.fl.sub, iter.max = 3000, nu = 1e-04, epsilon = 0.001,
                                    weight.basis = T, truep = NULL, select.marker = T, markers = NULL,
                                    marker.varname = NULL, allgenes.fl = F, pseudocount.use = 1,
                                    LFC.lim = 0.5, ct.cell.size = NULL, fl.cell.size = NULL, ...)
{
  if (is.null(ct.sub)) {
    ct.sub <- unique(sc.eset@phenoData@data[, ct.varname])[!is.na(unique(sc.eset@phenoData@data[,
                                                                                                ct.varname]))]
  }
  ct.sub <- ct.sub[!is.na(ct.sub)]
  ct.fl.sub <- ct.fl.sub[!is.na(ct.fl.sub)]
  bulk.eset <- bulk.eset[rowSums(exprs(bulk.eset)) > 0, , drop = FALSE]
  sc.eset <- sc.eset[, sc.eset@phenoData@data[, ct.varname] %in%
                       ct.sub]
  sc.basis <- SCDC_basis(x = sc.eset, ct.sub = ct.sub, ct.varname = ct.varname,
                         sample = sample, ct.cell.size = ct.cell.size)
  sc.fl.basis <- SCDC_basis(x = sc.eset, ct.sub = ct.fl.sub[!is.na(ct.fl.sub)],
                            ct.varname = fl.varname, sample = sample, ct.cell.size = fl.cell.size)
  if (select.marker) {
    if (is.null(marker.varname)) {
      marker.varname <- ct.varname
    }
    countmat <- exprs(sc.eset)
    ct.group <- sc.eset@phenoData@data[, marker.varname]
    markers.wilcox <- NULL
    for (u in 1:length(unique(ct.group))) {
      ct.group.temp <- ct.group == unique(ct.group)[u]
      group.1 <- apply(X = countmat[, ct.group.temp], MARGIN = 1,
                       FUN = function(x) log(x = mean(x = expm1(x = x)) +
                                               pseudocount.use))
      group.2 <- apply(X = countmat[, !ct.group.temp],
                       MARGIN = 1, FUN = function(x) log(x = mean(x = expm1(x = x)) +
                                                           pseudocount.use))
      genes.diff <- rownames(sc.eset)[(group.1 - group.2) >
                                        LFC.lim]
      count.use <- countmat[rownames(sc.eset) %in% genes.diff,
                            ]
      p_val <- sapply(1:nrow(count.use), function(x) {
        wilcox.test(count.use[x, ] ~ ct.group.temp)$p.value
      })
      p_val_adj <- p.adjust(p = p_val, method = "bonferroni",
                            n = nrow(count.use))
      markers.temp <- rownames(count.use)[p_val_adj < 0.05]
      markers.wilcox <- c(markers.wilcox, markers.temp)
    }
    markers <- unique(markers.wilcox)
    message("Selected ", length(markers), " marker genes by Wilcoxon test...")
  }
  if (weight.basis) {
    basis <- sc.basis$basis.mvw
    basis.fl <- sc.fl.basis$basis.mvw
  }
  else {
    basis <- sc.basis$basis
    basis.fl <- sc.fl.basis$basis
  }
  if (!is.null(markers)) {
    commongenes <- Reduce(intersect, list(rownames(basis),
                                          rownames(bulk.eset), markers))
    commongenes.fl <- Reduce(intersect, list(rownames(basis.fl),
                                             rownames(bulk.eset), markers))
  }
  else {
    commongenes <- intersect(rownames(basis), rownames(bulk.eset))
    commongenes.fl <- intersect(rownames(basis.fl), rownames(bulk.eset))
    if (length(commongenes) < 0.2 * min(dim(sc.eset)[1],
                                        dim(bulk.eset)[1])) {
      stop("Too few common genes!")
    }
  }
  message(paste("Used", length(commongenes), "common genes for all cell types, \n",
                "Used", length(commongenes.fl), "common genes for first level cell types..."))
  basis.mvw <- basis[commongenes, ct.sub]
  basis.mvw.fl <- basis.fl[commongenes.fl, ct.fl.sub]
  xbulk0 <- getCPM0(exprs(bulk.eset)[commongenes, ])
  xbulk <- as.matrix(xbulk0)
  colnames(xbulk) <- colnames(bulk.eset)
  xbulk1 <- getCPM0(exprs(bulk.eset)[commongenes.fl, ])
  xbulk.fl <- as.matrix(xbulk1)
  ALS.S <- sc.basis$sum.mat[ct.sub]
  N.bulk <- ncol(bulk.eset)
  valid.ct <- (colSums(is.na(basis.mvw)) == 0) & (!is.na(ALS.S))
  ALS.S.fl <- sc.fl.basis$sum.mat[ct.fl.sub]
  valid.ct.fl <- (colSums(is.na(basis.mvw.fl)) == 0) & (!is.na(ALS.S.fl))
  if (sum(valid.ct) <= 1) {
    stop("Not enough valid cell type!")
  }
  message(paste("Used", sum(valid.ct), "cell types in deconvolution...\n",
                "Used", sum(valid.ct.fl), "first level cell types ..."))
  basis.mvw <- basis.mvw[, valid.ct]
  ALS.S <- ALS.S[valid.ct]
  basis.mvw.fl <- basis.mvw.fl[, valid.ct.fl]
  ALS.S.fl <- ALS.S[valid.ct.fl]
  prop.est <- NULL
  rsquared <- NULL
  for (i in 1:N.bulk) {
    xbulk.temp <- xbulk[, i]
    message(paste(colnames(xbulk)[i], "has common genes",
                  sum(xbulk[, i] != 0), "..."))
    if (allgenes.fl) {
      markers.fl <- names(xbulk.temp)
    }
    else {
      markers.fl <- Reduce(intersect, list(markers, names(xbulk.temp)))
    }
    lm <- nnls::nnls(A = basis.mvw.fl[markers.fl, ], b = xbulk.temp[markers.fl])
    delta <- lm$residuals
    wt.gene <- 1/(nu + delta^2)
    x.wt <- xbulk.temp[markers.fl] * sqrt(wt.gene)
    b.wt <- sweep(basis.mvw.fl[markers.fl, ], 1, sqrt(wt.gene),
                  "*")
    lm.wt <- nnls::nnls(A = b.wt, b = x.wt)
    prop.wt.fl <- lm.wt$x/sum(lm.wt$x)
    delta <- lm.wt$residuals
    for (iter in 1:iter.max) {
      wt.gene <- 1/(nu + delta^2)
      x.wt <- xbulk.temp[markers.fl] * sqrt(wt.gene)
      b.wt <- sweep(basis.mvw.fl[markers.fl, ], 1, sqrt(wt.gene),
                    "*")
      lm.wt <- nnls::nnls(A = b.wt, b = x.wt)
      delta.new <- lm.wt$residuals
      prop.wt.fl.new <- lm.wt$x/sum(lm.wt$x)
      if (sum(abs(prop.wt.fl.new - prop.wt.fl)) < epsilon) {
        prop.wt.fl <- prop.wt.fl.new
        delta <- delta.new
        message("WNNLS for First level clusters Converged at iteration ",
                iter)
        break
      }
      prop.wt.fl <- prop.wt.fl.new
      delta <- delta.new
    }
    names(prop.wt.fl) <- colnames(basis.mvw.fl)
    rt <- table(sc.eset@phenoData@data[, ct.varname], sc.eset@phenoData@data[,
                                                                             fl.varname])
    rt <- rt[, ct.fl.sub]
    rt.list <- list()
    prop.wt <- NULL
    for (j in 1:ncol(rt)) {
      rt.list[[j]] <- rownames(rt)[rt[, j] > 0]
      names(rt.list)[j] <- colnames(rt)[j]
      sub.cl <- rownames(rt)[rt[, j] > 0]
      if (length(sub.cl) > 1 & prop.wt.fl[colnames(rt)[j]] >
          0) {
        if (is.null(dim(prop.wt.fl))) {
          xbulk.j <- basis.mvw.fl[, j] * prop.wt.fl[j] +
            (xbulk.temp - basis.mvw.fl %*% lm.wt$x) *
            prop.wt.fl[j]
        }
        else {
          xbulk.j <- basis.mvw.fl[, j] * prop.wt.fl[,
                                                    j] + (xbulk.temp - basis.mvw.fl %*% lm.wt$x) *
            prop.wt.fl[, j]
        }
        markers.sl <- Reduce(intersect, list(markers,
                                             rownames(xbulk.j)))
        basis.sl <- basis.mvw[markers.sl, rownames(rt)[rt[,
                                                          j] > 0]]
        lm.sl <- nnls::nnls(A = basis.sl, b = xbulk.j[markers.sl,
                                                      ])
        delta.sl <- lm.sl$residuals
        wt.gene.sl <- 1/(nu + delta.sl^2)
        x.wt.sl <- xbulk.j[markers.sl, ] * sqrt(wt.gene.sl)
        b.wt.sl <- sweep(basis.sl, 1, sqrt(wt.gene.sl),
                         "*")
        lm.wt.sl <- nnls::nnls(A = b.wt.sl, b = x.wt.sl)
        prop.wt.sl <- lm.wt.sl$x/sum(lm.wt.sl$x)
        delta.sl <- lm.wt.sl$residuals
        for (iter in 1:iter.max) {
          wt.gene.sl <- 1/(nu + delta.sl^2)
          x.wt.sl <- xbulk.j[markers.sl, ] * sqrt(wt.gene.sl)
          b.wt.sl <- sweep(basis.sl, 1, sqrt(wt.gene.sl),
                           "*")
          lm.wt.sl <- nnls::nnls(A = b.wt.sl, b = x.wt.sl)
          delta.sl.new <- lm.wt.sl$residuals
          prop.wt.sl.new <- lm.wt.sl$x/sum(lm.wt.sl$x)
          if (sum(abs(prop.wt.sl.new - prop.wt.sl)) <
              epsilon) {
            prop.wt.sl <- prop.wt.sl.new
            delta.sl <- delta.sl.new
            cat("WNNLS for Second level clusters",
                rownames(rt)[rt[, j] > 0], "Converged at iteration ",
                iter)
            break
          }
          prop.wt.sl <- prop.wt.sl.new
          delta.sl <- delta.sl.new
        }
        names(prop.wt.sl) <- sub.cl
        prop.wt <- c(prop.wt, prop.wt.sl * prop.wt.fl[colnames(rt)[j]])
      }
      else if (length(sub.cl) == 1) {
        temp <- prop.wt.fl[colnames(rt)[j]]
        names(temp) <- rt.list[[j]]
        prop.wt <- c(prop.wt, temp)
      }
      else if (length(sub.cl) > 1 & prop.wt.fl[colnames(rt)[j]] ==
               0) {
        prop.wt.sl <- rep(0, length(sub.cl))
        names(prop.wt.sl) <- sub.cl
        prop.wt <- c(prop.wt, prop.wt.sl)
      }
    }
    prop.est <- rbind(prop.est, prop.wt)
  }
  rownames(prop.est) <- colnames(bulk.eset)
  peval <- NULL
  if (!is.null(truep)) {
    peval <- SCDC_peval(ptrue = truep, pest = prop.est, pest.names = c("SCDC"),
                        select.ct = ct.sub)
  }

  # calculate yhat after deconv
  yhat <- sc.basis$basis.mvw %*% t(prop.est)[colnames(sc.basis$basis.mvw),]

  return(list(prop.est = prop.est, prop.wt.fl = prop.wt.fl,
              basis.mvw = basis.mvw, peval = peval, sc.basis = sc.basis,
              sc.fl.basis = sc.fl.basis, yhat = yhat))
}


############################################
#' Tree-guided proportion estimation for ONE subject
#' @description Proportion estimation function for ONE-subject case, and apply tree-guided deconvolution
#' @name SCDC_prop_ONE_subcl_marker
#' @param bulk.eset ExpressionSet object for bulk samples
#' @param sc.eset ExpressionSet object for single cell samples
#' @param ct.varname variable name for 'cell types'
#' @param fl.varname variable name for first-level 'meta-clusters'
#' @param sample variable name for subject/samples
#' @param ct.sub a subset of cell types that are selected to construct basis matrix
#' @param ct.fl.sub 'cell types' for first-level 'meta-clusters'
#' @param iter.max the maximum number of iteration in WNNLS
#' @param nu a small constant to facilitate the calculation of variance
#' @param epsilon a small constant number used for convergence criteria
#' @param weight.basis logical, use basis matrix adjusted by MVW, default is T.
#' @param select.marker logical, select marker genes to perform deconvolution in tree-guided steps. Default is T.
#' @param markers A set of marker gene that input manully to be used in deconvolution. If NULL, then
#' @param marker.varname variable name of cluster groups when selecting marker genes. If NULL, then use ct.varname.
#' @param allgenes.fl logical, use all genes in the first-level deconvolution
#' @param pseudocount.use a constant number used when selecting marker genes, default is 1.
#' @param LFC.lim a threshold of log fold change when selecting genes as input to perform Wilcoxon's test.
#' @param truep true cell-type proportions for bulk samples if known
#' @param ct.cell.size default is NULL, which means the "library size" is calculated based on the data. Users can specify a vector of cell size factors corresponding to the ct.sub according to prior knowledge. The vector should be named: names(ct.cell.size input) should not be NULL.
#' @param fl.cell.size default is NULL, similar to ct.cell.size. This is for first-level 'meta-clusters'.
#' @return Estimated proportion, basis matrix, predicted gene expression levels for bulk samples
#' @export
SCDC_prop_ONE_subcl_marker <- function(bulk.eset, sc.eset, ct.varname, fl.varname, sample, truep = NULL,
                                       ct.sub = NULL, ct.fl.sub, iter.max = 3000, nu = 1e-04, epsilon = 0.001,
                                       weight.basis = F, bulk_disease = NULL, select.marker = T, markers = NULL, marker.varname = NULL,
                                       pseudocount.use = 1, LFC.lim = 0.5, allgenes.fl = F, ct.cell.size = NULL, fl.cell.size = NULL,
                                       ...)
{

  if (is.null(ct.sub)){
    ct.sub <- unique(sc.eset@phenoData@data[,ct.varname])[!is.na(unique(sc.eset@phenoData@data[,ct.varname]))]
  }
  ct.sub <- ct.sub[!is.na(ct.sub)]
  ct.fl.sub <- ct.fl.sub[!is.na(ct.fl.sub)]
  bulk.eset <- bulk.eset[rowSums(exprs(bulk.eset))>0, , drop = FALSE]
  sc.basis <- SCDC_basis_ONE(x = sc.eset, ct.sub = ct.sub, ct.varname = ct.varname, sample = sample, ct.cell.size = ct.cell.size)
  sc.fl.basis <- SCDC_basis_ONE(x = sc.eset, ct.sub = ct.fl.sub[!is.na(ct.fl.sub)],
                                ct.varname = fl.varname, sample = sample, ct.cell.size = fl.cell.size)
  if (select.marker){
    if (is.null(marker.varname)){
      marker.varname <- ct.varname
    }
    # wilcox test on two groups of cells for marker gene selection... (refer to seurat::FindMarkers)
    countmat <- exprs(sc.eset)
    ct.group <- sc.eset@phenoData@data[,marker.varname]
    markers.wilcox <- NULL
    # u=1
    for(u in 1:length(unique(ct.group))){
      ct.group.temp <- ct.group == unique(ct.group)[u]
      group.1 <- apply(X = countmat[,ct.group.temp],
                       MARGIN = 1, FUN = function(x) log(x = mean(x = expm1(x = x)) +
                                                           pseudocount.use))
      group.2 <- apply(X = countmat[,! ct.group.temp],
                       MARGIN = 1, FUN = function(x) log(x = mean(x = expm1(x = x)) +
                                                           pseudocount.use))
      genes.diff <- rownames(sc.eset)[(group.1 - group.2) > LFC.lim]
      count.use <- countmat[rownames(sc.eset) %in% genes.diff,]

      ##
      p_val <- sapply(1:nrow(count.use), function(x){
        wilcox.test(count.use[x,] ~ ct.group.temp)$p.value
      })
      p_val_adj <- p.adjust(p = p_val, method = "bonferroni",
                            n = nrow(count.use))
      markers.temp <- rownames(count.use)[p_val_adj < 0.05]
      markers.wilcox <- c(markers.wilcox, markers.temp)
    }
    markers <- unique(markers.wilcox)
    message("Selected ",length(markers), " marker genes by Wilcoxon test...")
  } # else need input of marker genes for clustering

  # match genes / cells first
  if (weight.basis){
    basis <- sc.basis$basis.mvw
    basis.fl <- sc.fl.basis$basis.mvw
  } else {
    basis <- sc.basis$basis
    basis.fl <- sc.fl.basis$basis
  }
  if (!is.null(markers)){
    commongenes <- Reduce(intersect, list(rownames(basis), rownames(bulk.eset), markers))
    commongenes.fl <- Reduce(intersect, list(rownames(basis.fl), rownames(bulk.eset), markers))
  } else {
    commongenes <- intersect(rownames(basis), rownames(bulk.eset))
    commongenes.fl <- intersect(rownames(basis.fl), rownames(bulk.eset))
    # stop when few common genes exist...
    if (length(commongenes) < 0.2 * min(dim(sc.eset)[1], dim(bulk.eset)[1])){
      stop('Too few common genes!')
    }
  }

  message(paste("Used", length(commongenes), "common genes for all cell types, \n",
                "Used", length(commongenes.fl), "common genes for first level cell types..."))

  basis.mvw <- basis[commongenes, ct.sub]
  basis.mvw.fl <- basis.fl[commongenes.fl, ct.fl.sub]

  xbulk0 <- getCPM0(exprs(bulk.eset)[commongenes,])
  xbulk <- as.matrix(xbulk0) ## whether to normalize all /common genes
  colnames(xbulk) <- colnames(bulk.eset)
  xbulk1 <- getCPM0(exprs(bulk.eset)[commongenes.fl,])
  xbulk.fl <- as.matrix(xbulk1)

  ALS.S <- sc.basis$sum.mat[ct.sub]
  N.bulk <- ncol(bulk.eset)
  valid.ct <- (colSums(is.na(basis.mvw)) == 0) & (!is.na(ALS.S))

  ALS.S.fl <- sc.fl.basis$sum.mat[ct.fl.sub]
  valid.ct.fl <- (colSums(is.na(basis.mvw.fl)) == 0) & (!is.na(ALS.S.fl))

  if (sum(valid.ct) <= 1) {
    stop("Not enough valid cell type!")
  }
  message(paste("Used", sum(valid.ct), "cell types in deconvolution...\n",
                "Used", sum(valid.ct.fl),"first level cell types ..."))

  basis.mvw <- basis.mvw[, valid.ct]
  ALS.S <- ALS.S[valid.ct]

  basis.mvw.fl <- basis.mvw.fl[, valid.ct.fl]
  ALS.S.fl <- ALS.S[valid.ct.fl]

  prop.est <- NULL
  rsquared <- NULL

  # prop estimation for each bulk sample:
  for (i in 1:N.bulk) {
    # i=1
    xbulk.temp <- xbulk[, i] *1e3 ## will affect a little bit
    message(paste(colnames(xbulk)[i], "has common genes", sum(xbulk[, i] != 0), "..."))
    if (allgenes.fl){
      markers.fl <- names(xbulk.temp)
    } else {
      markers.fl <- Reduce(intersect, list(markers, names(xbulk.temp)))
    }

    # first level NNLS:
    lm <- nnls::nnls(A=basis.mvw.fl[markers.fl,],b=xbulk.temp[markers.fl])
    delta <- lm$residuals
    wt.gene <- 1/(nu + delta^2)
    x.wt <- xbulk.temp[markers.fl] *sqrt(wt.gene)
    b.wt <- sweep(basis.mvw.fl[markers.fl,],1,sqrt(wt.gene),"*")

    lm.wt <- nnls::nnls(A=b.wt, b=x.wt)
    prop.wt.fl <- lm.wt$x/sum(lm.wt$x)
    delta <- lm.wt$residuals

    for (iter in 1:iter.max){
      wt.gene <- 1/(nu + delta^2)
      x.wt <- xbulk.temp[markers.fl] * sqrt(wt.gene)
      b.wt <- sweep(basis.mvw.fl[markers.fl,],1,sqrt(wt.gene),"*")
      lm.wt <- nnls::nnls(A=b.wt, b=x.wt)
      delta.new <- lm.wt$residuals
      prop.wt.fl.new <- lm.wt$x/sum(lm.wt$x)

      if (sum(abs(prop.wt.fl.new - prop.wt.fl)) < epsilon){
        prop.wt.fl <- prop.wt.fl.new
        delta <- delta.new
        message("WNNLS for First level clusters Converged at iteration ", iter)
        break
      }
      prop.wt.fl <- prop.wt.fl.new
      delta <- delta.new
    }
    names(prop.wt.fl) <- colnames(basis.mvw.fl)

    # relationship between first level and overall
    rt <- table(sc.eset@phenoData@data[,ct.varname], sc.eset@phenoData@data[,fl.varname])
    rt <- rt[,ct.fl.sub]
    rt.list <- list()
    prop.wt <- NULL

    # prop.wt
    for (j in 1:ncol(rt)){ # for each first level cluster
      # j=1
      rt.list[[j]] <- rownames(rt)[rt[,j] >0]
      names(rt.list)[j] <- colnames(rt)[j]
      sub.cl <- rownames(rt)[rt[,j] >0]
      if (length(sub.cl) > 1 & prop.wt.fl[colnames(rt)[j]] > 0) {
        if (is.null(dim(prop.wt.fl))){
          # specify genes in xbulk.j??? first level genes?
          xbulk.j <- basis.mvw.fl[,j]*prop.wt.fl[j] + (xbulk.temp - basis.mvw.fl %*% lm.wt$x)*prop.wt.fl[j]
        } else {
          xbulk.j <- basis.mvw.fl[,j]*prop.wt.fl[,j] + (xbulk.temp - basis.mvw.fl %*% lm.wt$x)*prop.wt.fl[,j]
        }

        markers.sl <- Reduce(intersect, list(markers, rownames(xbulk.j)))

        ##############################################################################
        # make markers.sub as a list, for each of the first-level intra clusters.
        ##############################################################################

        basis.sl <- basis.mvw[markers.sl,rownames(rt)[rt[,j] >0]]
        lm.sl <- nnls::nnls(A=basis.sl,b=xbulk.j[markers.sl,])
        delta.sl <- lm.sl$residuals
        wt.gene.sl <- 1/(nu + delta.sl^2)
        x.wt.sl <- xbulk.j[markers.sl,]*sqrt(wt.gene.sl)
        b.wt.sl <- sweep(basis.sl,1,sqrt(wt.gene.sl),"*")

        lm.wt.sl <- nnls::nnls(A=b.wt.sl, b=x.wt.sl)
        prop.wt.sl <- lm.wt.sl$x/sum(lm.wt.sl$x)
        delta.sl <- lm.wt.sl$residuals

        for (iter in 1:iter.max){
          wt.gene.sl <- 1/(nu + delta.sl^2)
          x.wt.sl <- xbulk.j[markers.sl,] * sqrt(wt.gene.sl)
          b.wt.sl <- sweep(basis.sl,1,sqrt(wt.gene.sl),"*")
          lm.wt.sl <- nnls::nnls(A=b.wt.sl, b=x.wt.sl)
          delta.sl.new <- lm.wt.sl$residuals
          prop.wt.sl.new <- lm.wt.sl$x/sum(lm.wt.sl$x)

          if (sum(abs(prop.wt.sl.new - prop.wt.sl)) < epsilon){
            prop.wt.sl <- prop.wt.sl.new
            delta.sl <- delta.sl.new
            cat("WNNLS for Second level clusters",rownames(rt)[rt[,j] >0],"Converged at iteration ", iter)
            break
          }
          prop.wt.sl <- prop.wt.sl.new
          delta.sl <- delta.sl.new
        }
        names(prop.wt.sl) <- sub.cl
        prop.wt <- c(prop.wt, prop.wt.sl*prop.wt.fl[colnames(rt)[j]])
      } else if (length(sub.cl) == 1){
        # j=2
        # prop.wt <- c(prop.wt, prop.wt.fl[colnames(rt)[j]])
        temp <- prop.wt.fl[colnames(rt)[j]]
        names(temp) <- rt.list[[j]]
        prop.wt <- c(prop.wt, temp)
      } else if (length(sub.cl) > 1 & prop.wt.fl[colnames(rt)[j]] == 0){
        prop.wt.sl <- rep(0, length(sub.cl))
        names(prop.wt.sl) <- sub.cl
        prop.wt <- c(prop.wt, prop.wt.sl)
      }

    }
    prop.est <- rbind(prop.est, prop.wt)
  }
  rownames(prop.est) <- colnames(bulk.eset)

  peval <- NULL
  if (!is.null(truep)){
    peval <- SCDC_eval(ptrue= truep, pest = prop.est, pest.names = c('SCDC'),
                       dtname = 'Perou', select.ct = ct.sub, bulk_obj = bulk.eset,
                       bulk_disease = bulk_disease)
  }

  return(list(prop.est = prop.est, prop.wt.fl = prop.wt.fl, basis.mvw = basis.mvw, peval = peval,
              sc.basis = sc.basis, sc.fl.basis = sc.fl.basis))
}


##############################################
#' Simple deconvolution function using W-NNLS
#' @description Deconvolution using pre-normalized count matrix and pre-calculated basis matrix.
#' @name deconv_simple
#' @param count.filter.norm Normalized count matrix for bulk samples. Genes by samples.
#' @param basis.norm Basis matrix calculated from single cell samples. Genes by cell-types.
#' @param iter.max the maximum number of iteration in WNNLS
#' @param nu a small constant to facilitate the calculation of variance
#' @param epsilon a small constant number used for convergence criteria
#' @param truep true cell-type proportions for bulk samples if known
#' @return Estimated proportion-prop.est.mvw for bulk samples
#' @export
deconv_simple <- function(count.filter.norm, basis.norm, iter.max = 2000, nu = 1e-10, epsilon = 0.001,
                          truep = NULL){
  ALS.S <- rep(1, ncol(basis.norm))
  N.bulk <- ncol(count.filter.norm)
  common.gene <- intersect(rownames(count.filter.norm), rownames(basis.norm))
  count.filter.norm <- count.filter.norm[common.gene,]
  basis.norm <- basis.norm[common.gene,]
  prop.est.mvw <- NULL
  yhat <- NULL
  yhatgene.temp <- rownames(basis.norm)
  for (i in 1:N.bulk) {
    xbulk.temp <- count.filter.norm[, i]
    message(paste(colnames(count.filter.norm)[i], "has common genes",
                  sum(count.filter.norm[, i] != 0), "..."))
    lm <- nnls::nnls(A = basis.norm, b = xbulk.temp)
    delta <- lm$residuals
    wt.gene <- 1/(nu + delta^2)
    x.wt <- xbulk.temp * sqrt(wt.gene)
    b.wt <- sweep(basis.norm, 1, sqrt(wt.gene), "*")
    lm.wt <- nnls::nnls(A = b.wt, b = x.wt)
    prop.wt <- lm.wt$x/sum(lm.wt$x)
    delta <- lm.wt$residuals
    for (iter in 1:iter.max) {
      wt.gene <- 1/(nu + delta^2)
      x.wt <- xbulk.temp * sqrt(wt.gene)
      b.wt <- sweep(basis.norm, 1, sqrt(wt.gene), "*")
      lm.wt <- nnls::nnls(A = b.wt, b = x.wt)
      delta.new <- lm.wt$residuals
      prop.wt.new <- lm.wt$x/sum(lm.wt$x)
      if (sum(abs(prop.wt.new - prop.wt)) < epsilon) {
        prop.wt <- prop.wt.new
        delta <- delta.new
        message("WNNLS Converged at iteration ",
                iter)
        break
      }
      prop.wt <- prop.wt.new
      delta <- delta.new
    }
    prop.est.mvw <- rbind(prop.est.mvw, prop.wt)
    yhat.temp <- basis.norm %*% as.matrix(lm.wt$x)
    yhatgene.temp <- intersect(rownames(yhat.temp), yhatgene.temp)
    yhat <- cbind(yhat[yhatgene.temp, ], yhat.temp[yhatgene.temp,])
  }
  colnames(prop.est.mvw) <- colnames(basis.norm)
  rownames(prop.est.mvw) <- colnames(count.filter.norm)
  return(list(prop.est.mvw = prop.est.mvw))
}

# res = deconv_simple(count.filter.norm, basis.norm)
# res$prop.est.mvw
meichendong/SCDC documentation built on Jan. 26, 2023, 5:08 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
meichendong/SCDC
Bulk RNA-seq deconvolution by scRNA-seq with multi-reference datasets

R/Deconvolution.R
In meichendong/SCDC: Bulk RNA-seq deconvolution by scRNA-seq with multi-reference datasets

Defines functions deconv_simple SCDC_prop_ONE_subcl_marker SCDC_qc_ONE SCDC_basis_ONE SCDC_basis

Documented in deconv_simple SCDC_basis SCDC_basis_ONE SCDC_prop_ONE_subcl_marker SCDC_qc_ONE

R Package Documentation

Browse R Packages

We want your feedback!

meichendong/SCDC Bulk RNA-seq deconvolution by scRNA-seq with multi-reference datasets

R/Deconvolution.R In meichendong/SCDC: Bulk RNA-seq deconvolution by scRNA-seq with multi-reference datasets

Defines functions deconv_simple SCDC_prop_ONE_subcl_marker SCDC_qc_ONE SCDC_basis_ONE SCDC_basis

Documented in deconv_simple SCDC_basis SCDC_basis_ONE SCDC_prop_ONE_subcl_marker SCDC_qc_ONE

R Package Documentation

Browse R Packages

We want your feedback!

meichendong/SCDC
Bulk RNA-seq deconvolution by scRNA-seq with multi-reference datasets

R/Deconvolution.R
In meichendong/SCDC: Bulk RNA-seq deconvolution by scRNA-seq with multi-reference datasets