R/DWLS.R

Defines functions Dwls buildSignatureMatrixMAST DEAnalysisMAST m.auc v.auc stat.log2 Mean.in.log2space buildSignatureMatrixUsingSeurat DEAnalysis solveSVR findDampeningConstant solveDampenedWLSj solveDampenedWLS solveOLSInternal solveOLS trimData

Documented in Dwls

# trim bulk and single-cell data to contain the same genes
trimData <- function(Signature, bulkdata) {
  Genes <- intersect(rownames(Signature), names(bulkdata))
  B <- bulkdata[Genes]
  S <- Signature[Genes, ]
  return(list("sig" = S, "bulk" = B))
}

# solve using OLS, constrained such that cell type numbers>0
solveOLS <- function(S, B) {
  D <- t(S) %*% S
  d <- t(S) %*% B
  A <- cbind(diag(dim(S)[2]))
  bzero <- c(rep(0, dim(S)[2]))
  solution <- quadprog::solve.QP(D, d, A, bzero)$solution
  names(solution) <- colnames(S)
  print(round(solution / sum(solution), 5))
  return(solution / sum(solution))
}

# return cell number, not proportion
# do not print output
solveOLSInternal <- function(S, B) {
  D <- t(S) %*% S
  d <- t(S) %*% B
  A <- cbind(diag(dim(S)[2]))
  bzero <- c(rep(0, dim(S)[2]))
  solution <- quadprog::solve.QP(D, d, A, bzero)$solution
  names(solution) <- colnames(S)
  return(solution)
}

# solve using WLS with weights dampened by a certain dampening constant
solveDampenedWLS <- function(S, B) {
  # first solve OLS, use this solution to find a starting point for the weights
  solution <- solveOLSInternal(S, B)
  # now use dampened WLS, iterate weights until convergence
  iterations <- 0
  changes <- c()
  # find dampening constant for weights using cross-validation
  j <- findDampeningConstant(S, B, solution)
  change <- 1
  while (change > .01 & iterations < 1000) {
    newsolution <- solveDampenedWLSj(S, B, solution, j)
    # decrease step size for convergence
    solutionAverage <- rowMeans(cbind(newsolution,
                                      matrix(solution,
                                             nrow = length(solution),
                                             ncol = 4)))
    change <- norm(as.matrix(solutionAverage - solution))
    solution <- solutionAverage
    iterations <- iterations + 1
    changes <- c(changes, change)
  }
  print(round(solution / sum(solution), 5))
  return(solution / sum(solution))
}

# solve WLS given a dampening constant
solveDampenedWLSj <- function(S, B, goldStandard, j) {
  multiplier <- 1 * 2 ^ (j - 1)
  sol <- goldStandard
  ws <- as.vector((1 / (S %*% sol)) ^ 2)
  wsScaled <- ws / min(ws)
  wsDampened <- wsScaled
  wsDampened[which(wsScaled > multiplier)] <- multiplier
  W <- diag(wsDampened)
  D <- t(S) %*% W %*% S
  d <- t(S) %*% W %*% B
  A <- cbind(diag(dim(S)[2]))
  bzero <- c(rep(0, dim(S)[2]))
  sc <- norm(D, "2")
  solution <- quadprog::solve.QP(D / sc, d / sc, A, bzero)$solution
  names(solution) <- colnames(S)
  return(solution)
}

# find a dampening constant for the weights using cross-validation
findDampeningConstant <- function(S, B, goldStandard) {
  solutionsSd <- NULL
  # goldStandard is used to define the weights
  sol <- goldStandard
  ws <- as.vector((1 / (S %*% sol)) ^ 2)
  wsScaled <- ws / min(ws)
  wsScaledMinusInf <- wsScaled
  # ignore infinite weights
  if (max(wsScaled) == "Inf") {
    wsScaledMinusInf <- wsScaled[-which(wsScaled == "Inf")]
  }
  # try multiple values of the dampening constant (multiplier)
  # for each, calculate the variance of the dampened weighted solution for a subset of genes
  for (j in 1:ceiling(log2(max(wsScaledMinusInf)))) {
    multiplier <- 1 * 2 ^ (j - 1)
    wsDampened <- wsScaled
    wsDampened[which(wsScaled > multiplier)] <- multiplier
    solutions <- NULL
    seeds <- c(1:100)
    for (i in 1:100) {
      set.seed(seeds[i]) # make nondeterministic
      subset <- sample(length(ws), size = length(ws) * 0.5) # randomly select half of gene set
      # solve dampened weighted least squares for subset
      fit = lm (B[subset] ~ -1 + S[subset, ], weights = wsDampened[subset])
      sol <- fit$coef * sum(goldStandard) / sum(fit$coef)
      solutions <- cbind(solutions, sol)
    }
    solutionsSd <- cbind(solutionsSd, apply(solutions, 1, sd))
  }
  # choose dampening constant that results in least cross-validation variance
  j <- which.min(colMeans(solutionsSd ^ 2))
  return(j)
}

solveSVR <- function(S, B) {
  # scaling
  ub = max(c(as.vector(S), B)) # upper bound
  lb = min(c(as.vector(S), B)) # lower bound
  Bs = ((B - lb) / ub) * 2 - 1
  Ss = ((S - lb) / ub) * 2 - 1

  # perform SVR
  model <-
    svm(
      Ss,
      Bs,
      nu = 0.5,
      scale = TRUE,
      type = "nu-regression",
      kernel = "linear",
      cost = 1
    )
  coef <- t(model$coefs) %*% model$SV
  coef[which(coef < 0)] <- 0
  coef <- as.vector(coef)
  names(coef) <- colnames(S)
  print(round(coef / sum(coef), 5))
  return(coef / sum(coef))
}

# perform DE analysis using Seurat
DEAnalysis <- function(scdata, id, path) {
  exprObj <- Seurat::CreateSeuratObject(raw.data = as.data.frame(scdata), project = "DE")
  exprObj2 <- Seurat::SetIdent(exprObj, ident.use = as.vector(id))
  print("Calculating differentially expressed genes:")
  for (i in unique(id)) {
    de_group <-
      Seurat::FindMarkers(
              object = exprObj2,
              ident.1 = i,
              ident.2 = NULL,
              only.pos = TRUE,
              test.use = "bimod")
    save(de_group, file = paste(path, "/de_", i, ".RData", sep = ""))
  }
}

# build signature matrix using genes identified by DEAnalysis()

buildSignatureMatrixUsingSeurat <-
  function(scdata,
           id,
           path,
           diff.cutoff = 0.5,
           pval.cutoff = 0.01) {
    # perform differential expression analysis
    DEAnalysis(scdata, id, path)

    numberofGenes <- c()
    for (i in unique(id)) {
      load(file = paste(path, "/de_", i, ".RData", sep = ""))
      DEGenes <-
        rownames(de_group)[intersect(which(de_group$p_val_adj < pval.cutoff),
                                            which(de_group$avg_logFC > diff.cutoff))]
      nonMir = grep("MIR|Mir", DEGenes, invert = T)
      assign(paste("cluster_lrTest.table.", i, sep = ""), de_group[which(rownames(de_group) %in%
                                                                           DEGenes[nonMir]), ])
      numberofGenes <- c(numberofGenes, length(DEGenes[nonMir]))
    }

    # need to reduce number of genes
    # for each subset, order significant genes by decreasing fold change, choose between 50 and 200 genes
    # choose matrix with lowest condition number
    conditionNumbers <- c()
    for (G in 50:200) {
      Genes <- c()
      j = 1
      for (i in unique(id)) {
        if (numberofGenes[j] > 0) {
          temp <- paste("cluster_lrTest.table.", i, sep = "")
          temp <- as.name(temp)
          temp <- eval(parse(text = temp))
          temp <- temp[order(temp$p_val_adj, decreasing = TRUE), ]
          Genes <- c(Genes, (rownames(temp)[1:min(G, numberofGenes[j])]))
        }
        j = j + 1
      }
      Genes <- unique(Genes)
      # make signature matrix
      ExprSubset <- scdata[Genes, ]
      Sig <- NULL
      for (i in unique(id)) {
        Sig <-
          cbind(Sig, (apply(ExprSubset, 1, function(y) mean(y[which(id == i)]))))}
      colnames(Sig) <- unique(id)
      conditionNumbers <- c(conditionNumbers, kappa(Sig))
    }
    G <- which.min(conditionNumbers) + min(49, numberofGenes - 1) # G is optimal gene number
    #
    Genes <- c()
    j <- 1
    for (i in unique(id)) {
      if (numberofGenes[j] > 0) {
        temp <- paste("cluster_lrTest.table.", i, sep = "")
        temp <- as.name(temp)
        temp <- eval(parse(text = temp))
        temp <- temp[order(temp$p_val_adj, decreasing = TRUE), ]
        Genes <- c(Genes, (rownames(temp)[1:min(G, numberofGenes[j])]))
      }
      j = j + 1
    }
    Genes <- unique(Genes)
    ExprSubset <- scdata[Genes, ]
    Sig <- NULL
    for (i in unique(id)) {
      Sig <- cbind(Sig, (apply(ExprSubset, 1, function(y) mean(y[which(id == i)]))))}
    colnames(Sig) <- unique(id)
    save(Sig, file = paste(path, "/Sig.RData", sep = ""))
    return(Sig)
  }

## alternative differential expression method using MAST

# functions for DE

Mean.in.log2space <- function(x, pseudo.count) {
  return(log2(mean(2 ^ (x) - pseudo.count) + pseudo.count))
}

stat.log2 <- function(data.m, group.v, pseudo.count) {
  # data.m = data.used.log2
  log2.mean.r <- aggregate(t(data.m), list(as.character(group.v)), function(x){
    Mean.in.log2space(x, pseudo.count)
  })
  log2.mean.r <- t(log2.mean.r)
  colnames(log2.mean.r) <- paste("mean.group", log2.mean.r[1, ], sep = "")
  log2.mean.r <- log2.mean.r[-1, ]
  log2.mean.r <- as.data.frame(log2.mean.r)
  log2.mean.r <- varhandle::unfactor(log2.mean.r)  # from varhandle
  log2.mean.r[, 1] <- as.numeric(log2.mean.r[, 1])
  log2.mean.r[, 2] <- as.numeric(log2.mean.r[, 2])
  log2_foldchange <- log2.mean.r$mean.group1 - log2.mean.r$mean.group0
  results <- data.frame(cbind(
    log2.mean.r$mean.group0,
    log2.mean.r$mean.group1,
    log2_foldchange
  ))
  colnames(results) <- c("log2.mean.group0", "log2.mean.group1", "log2_fc")
  rownames(results) <- rownames(log2.mean.r)
  return(results)
}

v.auc <- function(data.v, group.v) {
  prediction.use <- ROCR::prediction(data.v, group.v, 0:1)
  perf.use <- ROCR::performance(prediction.use, "auc")
  auc.use <- round(perf.use@y.values[[1]], 3)
  return(auc.use)
}
m.auc <- function(data.m, group.v) {
  AUC <- apply(data.m, 1, function(x) v.auc(x, group.v))
  AUC[is.na(AUC)] <- 0.5
  return(AUC)

}

# perform DE analysis using MAST
DEAnalysisMAST <- function(scdata, id, path) {
  pseudo.count <- 0.1
  data.used.log2 <- log2(scdata + pseudo.count)
  colnames(data.used.log2) <- make.unique(colnames(data.used.log2))
  diff.cutoff = 0.5
  for (i in unique(id)) {
    cells.symbol.list2 <- colnames(data.used.log2)[which(id == i)]
    cells.coord.list2 <- match(cells.symbol.list2, colnames(data.used.log2))
    cells.symbol.list1 <- colnames(data.used.log2)[which(id != i)]
    cells.coord.list1 <- match(cells.symbol.list1, colnames(data.used.log2))
    data.used.log2.ordered <- cbind(data.used.log2[, cells.coord.list1],
                                    data.used.log2[, cells.coord.list2])
    group.v <- c(rep(0, length(cells.coord.list1)),
                 rep(1, length(cells.coord.list2)))
    # ouput
    log2.stat.result <- stat.log2(data.used.log2.ordered, group.v, pseudo.count)
    Auc <- m.auc(data.used.log2.ordered, group.v)
    bigtable <- data.frame(cbind(log2.stat.result, Auc))
    DE <- bigtable[bigtable$log2_fc > diff.cutoff, ]
    dim(DE)
    if (dim(DE)[1] > 1) {
      data.1 <- data.used.log2[, cells.coord.list1]
      data.2 <- data.used.log2[, cells.coord.list2]
      genes.list <- rownames(DE)
      log2fold_change <- cbind(genes.list, DE$log2_fc)
      colnames(log2fold_change) <- c("gene.name", "log2fold_change")
      counts <- as.data.frame(cbind(data.1[genes.list, ], data.2[genes.list, ]))
      groups <- c(rep("Cluster_Other", length(cells.coord.list1)), rep(i, length(cells.coord.list2)))
      groups <- as.character(groups)
      data_for_MIST <-
        as.data.frame(cbind(
          rep(rownames(counts), dim(counts)[2]),
          reshape::melt(counts),
          rep(groups, each = dim(counts)[1]),
          rep(1, dim(counts)[1] * dim(counts)[2])
        ))
      colnames(data_for_MIST) <- c("Gene", "Subject.ID", "Et", "Population", "Number.of.Cells")
      vbeta <- data_for_MIST
      vbeta.fa <- MAST::FromFlatDF(
        vbeta,
        idvars <- c("Subject.ID"),
        primerid = 'Gene',
        measurement = 'Et',
        ncells = 'Number.of.Cells',
        geneid = "Gene",
        cellvars = c('Number.of.Cells', 'Population'),
        phenovars = c('Population'),
        id = 'vbeta all'
      )
      vbeta.1 <- subset(vbeta.fa, Number.of.Cells == 1)
      # .3 MAST
      head(SummarizedExperiment::colData(vbeta.1))
      zlm.output <-
        MAST::zlm(~ Population,
                  vbeta.1,
                  method = 'bayesglm',
                  ebayes = TRUE)
      show(zlm.output)
      coefAndCI <- summary(zlm.output, logFC = TRUE)
      zlm.lr <- MAST::lrTest(zlm.output, 'Population')
      zlm.lr_pvalue <- reshape::melt(zlm.lr[, , 'Pr(>Chisq)'])
      zlm.lr_pvalue <- zlm.lr_pvalue[which(zlm.lr_pvalue$test.type == 'hurdle'), ]

      lrTest.table <- merge(zlm.lr_pvalue, DE, by.x = "primerid", by.y = "row.names")
      colnames(lrTest.table) <- c("Gene",
                                  "test.type",
                                  "p_value",
                                  paste("log2.mean.", "Cluster_Other", sep = ""),
                                  paste("log2.mean.", i, sep = ""),
                                  "log2fold_change",
                                  "Auc")
      cluster_lrTest.table <- lrTest.table[rev(order(lrTest.table$Auc)), ]
      #. 4 save results
      write.csv(cluster_lrTest.table, file = paste(path, "/", i, "_lrTest.csv", sep = ""))
      save(cluster_lrTest.table, file = paste(path, "/", i, "_MIST.RData", sep = ""))
    }
  }
}

# build signature matrix using genes identified by DEAnalysisMAST()
buildSignatureMatrixMAST <- function(scdata,
                                     id,
                                     path,
                                     diff.cutoff = 0.5,
                                     pval.cutoff = 0.01) {
  # compute differentially expressed genes for each cell type
  DEAnalysisMAST(scdata, id, path)

  # for each cell type, choose genes in which FDR adjusted p-value is less than 0.01 and the estimated fold-change
  # is greater than 0.5
  numberofGenes <- c()
  for (i in unique(id)) {
    if (file.exists(paste(path, "/", i, "_MIST.RData", sep = ""))) {
      load(file = paste(path, "/", i, "_MIST.RData", sep = ""))
      pvalue_adjusted <- stats::p.adjust(
          cluster_lrTest.table[, 3],
          method = "fdr",
          n = length(cluster_lrTest.table[, 3]))
      cluster_lrTest.table <- cbind(cluster_lrTest.table, pvalue_adjusted)
      DEGenes <- cluster_lrTest.table$Gene[intersect(
          which(pvalue_adjusted < pval.cutoff),
          which(cluster_lrTest.table$log2fold_change > diff.cutoff))]
      nonMir <- grep("MIR|Mir", DEGenes, invert = T)  # because Mir gene is usually not accurate
      assign(paste("cluster_lrTest.table.", i, sep = ""),
             cluster_lrTest.table[which(cluster_lrTest.table$Gene %in% DEGenes[nonMir]), ])
      numberofGenes <- c(numberofGenes, length(DEGenes[nonMir]))
    }
  }

  # need to reduce number of genes
  # for each subset, order significant genes by decreasing fold change, choose between 50 and 200 genes
  # for each, iterate and choose matrix with lowest condition number
  conditionNumbers <- c()
  for (G in 50:200) {
    Genes <- c()
    j = 1
    for (i in unique(id)) {
      if (numberofGenes[j] > 0) {
        temp <- paste("cluster_lrTest.table.", i, sep = "")
        temp <- as.name(temp)
        temp <- eval(parse(text = temp))
        temp <- temp[order(temp$log2fold_change, decreasing = TRUE), ]
        Genes <- c(Genes, varhandle::unfactor(temp$Gene[1:min(G, numberofGenes[j])]))
      }
      j = j + 1
    }
    Genes <- unique(Genes)
    # make signature matrix
    ExprSubset <- scdata[Genes, ]

    mean.sig <- function(y){
      y <- y[which(id==i)]
      mean.y <- mean(y)
      return(mean.y)
    }

    Sig <- NULL
    for (i in unique(id)) {
      Sig <- cbind(Sig, apply(ExprSubset, 1, mean.sig))

    }
    colnames(Sig) <- unique(id)
    conditionNumbers <- c(conditionNumbers, kappa(Sig))
  }

  G <- which.min(conditionNumbers) + min(49, numberofGenes - 1)
  Genes <- c()
  j = 1
  for (i in unique(id)) {
    if (numberofGenes[j] > 0) {
      temp <- paste("cluster_lrTest.table.", i, sep = "")
      temp <- as.name(temp)
      temp <- eval(parse(text = temp))
      temp <- temp[order(temp$log2fold_change, decreasing = TRUE), ]
      Genes <-
        c(Genes, varhandle::unfactor(temp$Gene[1:min(G, numberofGenes[j])]))
    }
    j = j + 1
  }
  Genes <- unique(Genes)
  ExprSubset <- scdata[Genes, ]
  Sig <- NULL
  for (i in unique(id)) {
    Sig <- cbind(Sig, (apply(ExprSubset, 1, function(y) mean(y[which(id == i)]))))
  }
  colnames(Sig) <- unique(id)
  save(Sig, file = paste(path, "/Sig.RData", sep = ""))
  return(Sig)
}


#' Title
#'
#' @param scdata single data with genes in rows and cells in columns.
#' @param bulkdata A matrix with genes in rows and samples in columns.
#' @param label The cell type label of single data.
#' @importFrom Seurat CreateSeuratObject NormalizeData FindVariableFeatures ScaleData
#' @importFrom BisqueRNA SeuratToExpressionSet
#' @return   A data frame of Mixed cellular fractions.
#' @export
#' @examples
#'
#' # Bulk <- Bulk_GSE60424
#' # SC <- SC_GSE60424
#' # Label <- Label_GSE60424$Label
#' # res <- Dwls(bulkdata = Bulk,
#' #             scdata = SC,
#' #             label = Label)
#'
#'


Dwls <- function(scdata, bulkdata, label) {
  label <- gsub("-","_",label)
  gene <- intersect(rownames(scdata),rownames(bulkdata))
  scdata <- scdata[gene,]
  bulkdata <- bulkdata[gene,]
  movname <-  names(which(table(label)<3))
  for (mov in movname) {
    temp <- which(label==mov)
    label <- label[-temp]
    scdata <-  scdata[,-temp]
  }


   tmp_path <- tempdir()

   filepath <- paste0(tmp_path,"/result_DWLS/")


   if(file.exists(filepath)){
     path <- filepath
   }else{
     dir.create(filepath)
     path <- filepath
   }
   Signature <- buildSignatureMatrixMAST(scdata=as.matrix(scdata),
                                         id=label,
                                         path=filepath,
                                         diff.cutoff=0.5,
                                         pval.cutoff=0.01)
  allCounts_DWLS <- NULL
  for(j in 1:(dim(bulkdata)[2])){
    S <- Signature
    Bulk <- bulkdata[,j]
    names(Bulk) <- rownames(bulkdata)
    Genes <- intersect(rownames(S),names(Bulk))
    B <- Bulk[Genes]
    S <- S[Genes,]
    solDWLS <- solveDampenedWLS(S,B)
    allCounts_DWLS <- cbind(allCounts_DWLS,solDWLS)
  }

  DWLS_res <- allCounts_DWLS

  colnames(allCounts_DWLS) <- colnames(bulkdata)

  res_Dwls <- allCounts_DWLS

  return(res_Dwls)
}
libcell/deconvBench documentation built on Sept. 24, 2022, 12:36 p.m.