R/design_data.R

Defines functions design_joint design_plot_joint design_sep design_plot_sep design_data

Documented in design_data design_joint design_sep

#' use scDesign to simulate scRNA-seq data
#' @param realcount A numeric matrix with rows representing genes and columns representing cells.
#' Gene names are given as row names.
#' @param S A number specifying the total number of RNA-seq reads. Default to 1e8.
#' When \code{ngroup > 1}, \code{S} should be a vector specifying the read number in each cell state.
#' @param ncell An integer specifying the number of cells. When \code{ngroup > 1},
#' \code{ncell} is vector specifying the number of cells in each cell state.
#' @param ngroup An integer giving the number of cell states to simulate. Defaults to 1.
#' @param pUp A value between 0 and 1 specifying the proportion of up regulated genes
#' between two adjacent cell states. Defaults to 0.05 and only used when \code{ngroup > 1}.
#' @param pDown A value between 0 and 1 specifying the proportion of down regulated genes
#' between two adjacent cell states. Defaults to 0.05 and only used when \code{ngroup > 1}.
#' @param fU A value specifying the upper bound of fold changes of differentially expressed genes.
#' Deaults to 5.
#' @param fL A value specifying the lower bound of fold changes of differentially expressed genes.
#' Deaults to 1.5.
#' @param ncores An integer specifying the number of cores used for parallel computation.
#' Defaults to 1.
#' @param exprmean A named vector of user-specified gene mean expression parameters.
#' The names of \code{exprmean} should match the rownames of \code{realcount} in the same order.
#' Supplied gene expression mean should be on the $log_10$ scale.
#' @return When \code{ngroup = 1}, \code{design_data} returns a simulated count matrix with
#' rows representing genes and columns representing cells.
#' When \code{ngroup > 1}, \code{design_data} returns a list of \code{ngroup} elements.
#' The g-th element corresponds to the g-th cell state, and is a list containing three elements:
#' \describe{
#'   \item{count:}{a count matrix with rows representing genes and columns representing cells;}
#'   \item{genesUp:}{a character vector giving the names of up-regulated genes from state g-1 to g;}
#'   \item{genesDown:}{a character vector giving the names of down-regulated genes from state g-1 to g.}
#' }
#' @export
#' @import parallel
#' @importFrom stats complete.cases dgamma dnorm prcomp quantile rgamma rnorm sd uniroot median pchisq
#' @author Wei Vivian Li, \email{liw@ucla.edu}
#' @author Jingyi Jessica Li, \email{jli@stat.ucla.edu}
design_data = function(realcount, S = 1e8, ncell, ngroup = 1,
                       pUp = 0.05, pDown = 0.05, fU = 5, fL = 1.5, ncores = 1,
                       exprmean = NULL){
  estpa = estimate_pa(realcount, ncores = ncores)
  if(is.null(exprmean)){
    if(ngroup == 1){
      simu_res = simulate_new_ofo(realcount, Js = ncell, estpa = estpa, S = S)
    }else{
      simu_res = simulate_de_mfo(realcount, estpa, Js = ncell,
                                 ngroup = ngroup, S = S,
                                 pUp = pUp, pDown = pDown, fU = fU, fL = fL)
    }
  }else{
    flag = sum(names(exprmean) != rownames(realcount))
    if(sum(flag) > 1)stop("Names of exprmean must match rownames of realcount!")
    if(sum(is.na(exprmean)) > 1)stop("exprmean must not contain NA values!")

    if(ngroup == 1){
      simu_res = simulate_ofo(estpa = estpa, Js = ncell, S = S, exprmean = exprmean)
    }else{
      simu_res = simulate_de_mfo(realcount, estpa, Js = ncell,
                                 ngroup = ngroup, S = S,
                                 pUp = pUp, pDown = pDown, fU = fU, fL = fL, exprmean = exprmean)
    }
  }

  return(simu_res)
}


design_plot_sep = function(prlist, ncell, plot_dir, pvals){
  theme_set(theme_bw())
  measures = c("precision", "recall", "TN", "F1", "F2")
  bestnum = sapply(prlist, function(mat){
    nums = sapply(1:nrow(mat), function(i){
      paste0(ncell[which.max(mat[i, ]), ], collapse = "vs")
    })
    num = names(sort(table(nums), decreasing=TRUE)[1])
    return(num)
  })
  measures = c("precision", "recall (TP)", "TN",
               "F1 (precision vs. recall)", "F2 (TP vs. TN)")
  numdata = data.frame(metric = measures, "cell number" = bestnum)

  prmat = lapply(1:5, function(k){
    mat = prlist[[k]]
    data = as.data.frame(mat)
    colnames(data) = paste0(ncell[,1], "vs", ncell[,2])
    data$pval = -log10(pvals)
    data$measure = measures[k]
    data = data %>% gather(ncell, value, -(measure:pval))
    data$ncell = factor(data$ncell, levels = paste0(ncell[,1], "vs", ncell[,2]))
    return(data)
  })
  red = "#CC0C00"; lred = "#EEAEAA"
  blue = "#5C88DA"; lblue = "#C8D7F2"
  yellow = "#FFB200"; lyellow = "#FFE5AA"

  ylabs = c("precision", "recall (TP)", "TN", "F1", "F2")
  plots = lapply(1:5, function(k){
    if(k %in% c(1,3)){low = lblue; high = blue}
    if(k == 2){low = lred; high = red}
    if(k %in% c(4,5)){low = lyellow; high = yellow}

    g1 = ggplot(prmat[[k]], aes(x = ncell, y = value, group = pval, color = pval)) +
      geom_line()+ scale_color_gradient(low = low, high = high, name = "-log(p)") +
      xlab("") + ylab(ylabs[k]) +
      # labs(color = "-log(threshold)") +
      theme(text = element_text(size=16), axis.text = element_text(size=16),
            legend.title = element_text(size=16),
            legend.text = element_text(size=16),
            legend.key.size = unit(2, 'lines'),
            axis.text.x = element_text(angle = 45, hjust = 1))
    return(g1)
  })

  mytheme = gridExtra::ttheme_default(base_size = 20)
  gtable = tableGrob(numdata, rows = NULL, theme = mytheme)
  width = nrow(ncell) * 1.5
  if(nrow(ncell) <= 3) width = 5
  height = 5.5
  if(max(ncell)>=1e4) height = height + 0.5*(log10(max(ncell))-4)
  pdf(paste0(plot_dir, "design_summary.pdf"),
      width = width, height = height)
  grid.draw(plots[[1]])
  grid.draw(plots[[2]])
  grid.draw(plots[[3]])
  grid.draw(plots[[4]])
  grid.draw(plots[[5]])
  grid.arrange(gtable)
  dev.off()
  return(0)
}


#' use scDesign to make experimental design assuming two cell states are sequenced independently
#' @param realcount1 A numeric matrix with rows representing genes and columns representing cells (cell state 1).
#' Gene names are given as row names.
#' @param realcount2 A numeric matrix with rows representing genes and columns representing cells (cell state 2).
#' Gene names are given as row names.
#' @param S1 A number specifying the total number of RNA-seq reads for cell state 1. Default to 1e8.
#' @param S2 A number specifying the total number of RNA-seq reads for cell state 2. Default to 1e8.
#' @param ncell A two-column matrix specifying the numbers of cells. Column 1 is for cell state 1 and
#' column 2 is for cell state 2. By default, the following cell number matrix is used:
#' \tabular{rr}{
#' 64 \tab 64 \cr
#' 128 \tab 128 \cr
#' 256 \tab 256 \cr
#' 512 \tab 512 \cr
#' 1024 \tab 1024 \cr
#' 2048 \tab 2048 \cr
#' 4096 \tab 4096 \cr
#' }
#' @param B An integer giving the number of experiments to repeat in order the calculate
#' the average DE analysis accuracy. Defaults to 100.
#' @param de_method A character specifying the differential expression analysis method to use.
#' Currently supports "ttest" (default) or "mast".
#' @param p_thre A numeric vector specifying the FDR thresholds used to identify
#' differentially expressed genes. Defaults to \code{10^seq(-2,-6,-1)}.
#' @param plot_dir A character giving the directory to save experimental design results
#' Defaults to "./".
#' @param ncores An integer specifying the number of cores used for parallel computation.
#' Defaults to 1.
#' @param rank An integer specifying the number of top DE genes to identify
#' from scImpute's results as the standard in DE analysis. Defaults to 1000.
#' @return A list of five elments:
#' \describe{
#'   \item{precision:}{a matrix of precision. }
#'   \item{recall:}{a matrix of recall (true positive rate).}
#'   \item{TN:}{a matrix of TN (true negative rate).}
#'   \item{F1:}{a matrix of F1 (precision vs. recall).}
#'   \item{F2:}{a matrix of F2 (TN vs. recall).}
#' }
#' In all the matrices, rows correspond to different FDR thresholds and
#' columns correspond to the cell numbers  specified in \code{ncell}.
#' \code{design_sep} also writes the list to design_summary.txt and
#' saves it to \code{plot_dir}.
#' The corresponding plots are also saved to \code{plot_dir}.
#' @export
#' @import parallel
#' @import grid
#' @import MAST
#' @importFrom SummarizedExperiment assay colData
#' @importFrom GenomicRanges mcols
#' @importFrom gridExtra ttheme_default tableGrob grid.arrange
#' @import ggplot2
#' @importFrom dplyr %>%
#' @importFrom tidyr gather
#' @importFrom utils write.table
#' @importFrom grDevices dev.off pdf
#' @importFrom data.table as.data.table setorder
#' @importFrom stats coef p.adjust rbinom rmultinom runif t.test var
#' @author Wei Vivian Li, \email{liw@ucla.edu}
#' @author Jingyi Jessica Li, \email{jli@stat.ucla.edu}
design_sep = function(realcount1, realcount2,
                        S1 = 1e8, S2 = 1e8, ncell = NULL, B = 100,
                        de_method = "ttest", p_thre = 10^seq(-2,-6,-1),
                        plot_dir = "./",
                        ncores = 1, rank = 1000){
  if(is.null(ncell)){
    message("Using default cell numbers ... ")
    ncell = cbind(round(2^seq(6,12,1)), round(2^seq(6,12,1)))
  }
  if(class(ncell) != "matrix") stop("ncell must be a matrix!")
  if(ncol(ncell) != 2) stop("ncell must have two columns!")
  if(is.null(rownames(realcount1))) stop("realcount1 must have rownames!")
  if(is.null(rownames(realcount1))) stop("realcount1 must have rownames!")

  ncell = data.frame(ncell)
  colnames(ncell) = c("n1", "n2")
  ncell = ncell[with(ncell, order(n1, n2)), , drop = FALSE]

  estpa1 = estimate_pa(realcount1, ncores = ncores)
  estpa2 = estimate_pa(realcount2, ncores = ncores)

  truede = de_scimpute(estpa1$pa, estpa2$pa, ncores = ncores, by = "rank", rank = rank)

  prlist = get_pr_mfdr(estpa1, estpa2, B=B, ncell=ncell, S1=S1, S2=S2,
                       de_method = de_method, degenes = truede,
                       fdr_thres = p_thre, ncores = ncores)
  dir.create(plot_dir, recursive = TRUE)
  design_plot_sep(prlist, ncell, plot_dir = plot_dir, pvals = p_thre)

  for(i in 1:5){
    cat(names(prlist)[i], file=paste0(plot_dir,"design_summary.txt"),
        sep="\n", append = TRUE)
    mat = round(prlist[[i]], digits = 3)
    mat1 = data.frame(rownames(mat), mat)
    colnames(mat1) = c("p_thre", colnames(mat))
    write.table(mat1, file=paste0(plot_dir,"design_summary.txt"),
                row.names=FALSE, col.names=TRUE, append = TRUE, quote = FALSE)
    cat("\n", file=paste0(plot_dir,"design_summary.txt"), append = TRUE)
  }
  return(prlist)
}


design_plot_joint = function(prlist, ncell, plot_dir, pvals){
  theme_set(theme_bw())
  measures = c("precision", "recall", "TN", "F1", "F2")
  bestnum = sapply(prlist, function(mat){
    nums = sapply(1:nrow(mat), function(i){
      as.character(ncell[which.max(mat[i, ])])
    })
    num = names(sort(table(nums), decreasing=TRUE)[1])
    return(num)
  })
  measures = c("precision", "recall (TP)", "TN",
               "F1 (precision vs. recall)", "F2 (TP vs. TN)")
  numdata = data.frame(metric = measures, "cell number" = bestnum)

  prmat = lapply(1:5, function(k){
    mat = prlist[[k]]
    data = as.data.frame(mat)
    colnames(data) = ncell
    data$pval = -log10(pvals)
    data$measure = measures[k]
    data = data %>% gather(ncell, value, -(measure:pval))
    data$ncell = factor(data$ncell, levels = ncell)
    return(data)
  })
  red = "#CC0C00"; lred = "#EEAEAA"
  blue = "#5C88DA"; lblue = "#C8D7F2"
  yellow = "#FFB200"; lyellow = "#FFE5AA"

  ylabs = c("precision", "recall (TP)", "TN", "F1", "F2")
  plots = lapply(1:5, function(k){
    if(k %in% c(1,3)){low = lblue; high = blue}
    if(k == 2){low = lred; high = red}
    if(k %in% c(4,5)){low = lyellow; high = yellow}

    g1 = ggplot(prmat[[k]], aes(x = ncell, y = value, group = pval, color = pval)) +
      geom_line()+ scale_color_gradient(low = low, high = high, name = "-log(p)") +
      xlab("") + ylab(ylabs[k]) +
      # labs(color = "-log(threshold)") +
      theme(text = element_text(size=16), axis.text = element_text(size=16),
            legend.title = element_text(size=16),
            legend.text = element_text(size=16),
            legend.key.size = unit(2, 'lines'),
            axis.text.x = element_text(angle = 45, hjust = 1))
    return(g1)
  })

  mytheme = gridExtra::ttheme_default(base_size = 20)
  gtable = tableGrob(numdata, rows = NULL, theme = mytheme)
  width = length(ncell) * 1.5
  if(length(ncell) <= 3) width = 5
  height = 5.5
  if(max(ncell)>=1e4) height = height + 0.5*(log10(max(ncell))-4)
  pdf(paste0(plot_dir, "design_summary.pdf"),
      width = width, height = height)
  grid.draw(plots[[1]])
  grid.draw(plots[[2]])
  grid.draw(plots[[3]])
  grid.draw(plots[[4]])
  grid.draw(plots[[5]])
  grid.arrange(gtable)
  dev.off()
  return(0)
}



#' use scDesign to make experimental design assuming two cell states are sequenced together
#' @param realcount1 A numeric matrix with rows representing genes and columns representing cells (cell state 1).
#' Gene names are given as row names.
#' @param realcount2 A numeric matrix with rows representing genes and columns representing cells (cell state 2).
#' Gene names are given as row names.
#' @param prop1 A number giving the proportion of state 1 cells in the cell population.
#' @param prop2 A number giving the proportion of state 2 cells in the cell population.
#' @param S A number specifying the total number of RNA-seq reads for the cell population. Default to 1e8.
#' @param ncell An integer vector specifying the total number of cells to sequence.
#' Defaults to \code{2^seq(6,13,1)}.
#' @param B An integer giving the number of experiments to repeat in order the calculate
#' the average DE analysis accuracy. Defaults to 100.
#' @param de_method A character specifying the differential expression analysis method to use.
#' Currently supports "ttest" (default) or "mast".
#' @param p_thre A numeric vector specifying the FDR thresholds used to identify
#' differentially expressed genes. Defaults to \code{10^seq(-2,-6,-1)}.
#' @param plot_dir A character giving the directory to save experimental design results
#' Defaults to "./".
#' @param ncores An integer specifying the number of cores used for parallel computation.
#' Defaults to 1.
#' @param rank An integer specifying the number of top DE genes to identify
#' from scImpute's results as the standard in DE analysis. Defaults to 1000.
#' @return A list of five elments:
#' \describe{
#'   \item{precision:}{a matrix of precision. }
#'   \item{recall:}{a matrix of recall (true positive rate).}
#'   \item{TN:}{a matrix of TN (true negative rate).}
#'   \item{F1:}{a matrix of F1 (precision vs. recall).}
#'   \item{F2:}{a matrix of F2 (TN vs. recall).}
#' }
#' In all the matrices, rows correspond to different FDR thresholds and
#' columns correspond to the cell numbers  specified in \code{ncell}.
#' \code{design_joint} also writes the list to design_summary.txt and
#' saves it to \code{plot_dir}.
#' The corresponding plots are also saved to \code{plot_dir}.
#' @export
#' @import parallel
#' @import grid
#' @import MAST
#' @importFrom SummarizedExperiment assay colData
#' @importFrom GenomicRanges mcols
#' @importFrom gridExtra ttheme_default tableGrob grid.arrange
#' @import ggplot2
#' @importFrom dplyr %>%
#' @importFrom tidyr gather
#' @importFrom utils write.table
#' @importFrom grDevices dev.off pdf
#' @importFrom data.table as.data.table setorder
#' @importFrom stats coef p.adjust rbinom rmultinom runif t.test var
#' @author Wei Vivian Li, \email{liw@ucla.edu}
#' @author Jingyi Jessica Li, \email{jli@stat.ucla.edu}
design_joint = function(realcount1, realcount2, prop1, prop2,
                        S = 1e8, ncell = round(2^seq(6,13,1)), B = 100,
                        de_method = "ttest", p_thre = 10^seq(-2,-6,-1),
                        plot_dir = "./",
                        ncores = 1, rank = 1000){

  if(!is.numeric(ncell)) stop("ncell must be an integer vector!")
  if(length(ncell) == 1) stop("ncell must have at least two elements!")
  if(prop1 + prop2 >1) stop("prop1 + prop2 > 1!")
  genes_int = intersect(rownames(realcount1), rownames(realcount2))
  if(length(genes_int) < 1000) stop("two real datasets have less than 1000 common genes!")
  realcount1 = realcount1[genes_int, ]
  realcount2 = realcount2[genes_int, ]

  estpa1 = estimate_pa(realcount1, ncores = ncores)
  estpa2 = estimate_pa(realcount2, ncores = ncores)

  truede = de_scimpute(estpa1$pa, estpa2$pa, ncores = ncores, by = "rank", rank = rank)

  prlist = get_pr_mfdr_joint(estpa1, estpa2, B=B, Js_vec = ncell,
                             prop1 = prop1, prop2 = prop2, S=S,
                             de_method = de_method, degenes = truede, by = "fdr",
                             fdr_thres = p_thre, ncores = ncores)
  dir.create(plot_dir, recursive = TRUE)
  design_plot_joint(prlist, ncell, plot_dir = plot_dir, pvals = p_thre)

  for(i in 1:5){
    cat(names(prlist)[i], file=paste0(plot_dir,"design_summary.txt"),
        sep="\n", append = TRUE)
    mat = round(prlist[[i]], digits = 3)
    mat1 = data.frame(rownames(mat), mat)
    colnames(mat1) = c("p_thre", colnames(mat))
    write.table(mat1, file=paste0(plot_dir,"design_summary.txt"),
                row.names=FALSE, col.names=TRUE, append = TRUE, quote = FALSE)
    cat("\n", file=paste0(plot_dir,"design_summary.txt"), append = TRUE)
  }

  return(prlist)
}
Vivianstats/scDesign documentation built on Dec. 17, 2020, 8:04 a.m.