R/sc_workflow.R
In scPipe: pipeline for single cell RNA-seq data analysis

Documented in create_processed_report create_report create_sce_by_dir

#' create a SingleCellExperiment object from data folder generated by preprocessing step
#'
#' after we run \code{sc_gene_counting} and finish the preprocessing step. \code{create_sce_by_dir}
#' can be used to generate the \link{SingleCellExperiment} obeject from the folder that contains gene count matrix and QC statistics.
#' it can also generate the html report based on the gene count and quality control statistics
#'
#' @param datadir the directory that contains all the data and `stat` subfolder.
#' @param organism the organism of the data. List of possible names can be retrieved using the function
#' `listDatasets`from `biomaRt` package. (i.e `mmusculus_gene_ensembl` or `hsapiens_gene_ensembl`)
#' @param gene_id_type gene id type of the data A possible list of ids can be retrieved using the function `listAttributes` from `biomaRt` package.
#' the commonly used id types are `external_gene_name`, `ensembl_gene_id` or `entrezgene`
#' @param pheno_data the external phenotype data that linked to each single cell. This should be an \code{AnnotatedDataFrame} object
#' @param report whether to generate the html report in the data folder
#'
#' @details after we run \code{sc_gene_counting} and finish the preprocessing step. \code{create_sce_by_dir}
#' can be used to generate the SingleCellExperiment obeject from the folder that contains gene count matrix and QC statistics.
#'
#' @return a SingleCellExperiment object
#'
#' @importFrom utils read.csv
#' @import SingleCellExperiment
#' @importClassesFrom SingleCellExperiment SingleCellExperiment
#' @importFrom utils packageVersion
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # the sce can be created fron the output folder of scPipe
#' # please refer to the vignettes
#' sce = create_sce_by_dir(datadir="output_dir_of_scPipe",
#'     organism="mmusculus_gene_ensembl",
#'     gene_id_type="ensembl_gene_id")
#' }
#' # or directly from the gene count and quality control matrix:
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#' dim(sce)
#'

create_sce_by_dir = function(datadir, organism=NULL, gene_id_type=NULL, pheno_data=NULL, report=FALSE) {
  gene_cnt = read.csv(file.path(datadir, "gene_count.csv"), row.names=1)
  cell_stat = read.csv(file.path(datadir, "stat", "cell_stat.csv"), row.names=1)
  demultiplex_stat = read.csv(file.path(datadir, "stat", "overall_stat.csv"))
  UMI_dup_stat = read.csv(file.path(datadir, "stat", "UMI_duplication_count.csv"))

  gene_cnt = gene_cnt[, order(colnames(gene_cnt))]
  cell_stat = cell_stat[order(rownames(cell_stat)), ]


  sce = SingleCellExperiment(assays = list(counts =as.matrix(gene_cnt)))
  sce@metadata$scPipe$version = packageVersion("scPipe")  # set version information
  if(!is.null(organism)){
    organism(sce) = organism
  }
  if(!is.null(gene_id_type)){
    gene_id_type(sce) = gene_id_type
  }
  QC_metrics(sce) = cell_stat
  if(!is.null(pheno_data)){
    colData(sce) = cbind(colData(sce), pheno_data[order(rownames(pheno_data)),])
  }

  demultiplex_info(sce) = demultiplex_stat
  UMI_dup_info(sce) = UMI_dup_stat
  #if(any(grepl("^ERCC-", rownames(sce)))){
  #  isSpike(sce, "ERCC") <- grepl("^ERCC-", rownames(sce))
  #}


  if(report){
    create_report(sample_name=basename(datadir),
                  outdir=datadir,
                  organism=organism,
                  gene_id_type=gene_id_type)
  }


  return(sce)
}


#' create_report
#'
#' create an HTML report using data generated by proprocessing step.
#'
#' @param sample_name sample name
#' @param outdir output folder
#' @param r1 file path of read1
#' @param r2 file path of read2 default to be NULL
#' @param outfq file path of the output of \code{sc_trim_barcode}
#' @param read_structure a list contains read structure configuration. For more help see `?sc_trim_barcode`
#' @param filter_settings a list contains read filter settings for more help see `?sc_trim_barcode`
#' @param align_bam the aligned bam file
#' @param genome_index genome index used for alignment
#' @param map_bam the mapped bam file
#' @param exon_anno the gff exon annotation used. Can have multiple files
#' @param stnd whether to perform strand specific mapping
#' @param fix_chr add `chr` to chromosome names, fix inconsistant names.
#' @param barcode_anno cell barcode annotation file path.
#' @param max_mis maximum mismatch allowed in barcode. Default to be 1
#' @param UMI_cor correct UMI sequence error: 0 means no correction, 1 means simple correction and merge UMI with distance 1.
#' @param gene_fl whether to remove low abundant gene count. Low abundant is defined as only one copy of one UMI for this gene
#' @param organism the organism of the data. List of possible names can be retrieved using the function
#' `listDatasets`from `biomaRt` package. (i.e `mmusculus_gene_ensembl` or `hsapiens_gene_ensembl`)
#' @param gene_id_type gene id type of the data A possible list of ids can be retrieved using the function `listAttributes` from `biomaRt` package.
#' the commonly used id types are `external_gene_name`, `ensembl_gene_id` or `entrezgene`
#'
#' @return no return
#' @export
#'
#' @examples
#' \dontrun{
#' create_report(sample_name="sample_001",
#'        outdir="output_dir_of_scPipe",
#'        r1="read1.fq",
#'        r2="read2.fq",
#'        outfq="trim.fq",
#'        read_structure=list(bs1=-1, bl1=2, bs2=6, bl2=8, us=0, ul=6),
#'        filter_settings=list(rmlow=TRUE, rmN=TRUE, minq=20, numbq=2),
#'        align_bam="align.bam",
#'        genome_index="mouse.index",
#'        map_bam="aligned.mapped.bam",
#'        exon_anno="exon_anno.gff3",
#'        stnd=TRUE,
#'        fix_chr=FALSE,
#'        barcode_anno="cell_barcode.csv",
#'        max_mis=1,
#'        UMI_cor=1,
#'        gene_fl=FALSE,
#'        organism="mmusculus_gene_ensembl",
#'        gene_id_type="ensembl_gene_id")
#' }
#'
create_report = function(sample_name,
                         outdir,
                         r1="NA",
                         r2="NA",
                         outfq="NA",
                         read_structure=list(bs1=0, bl1=0, bs2=0, bl2=0, us=0, ul=0),
                         filter_settings=list(rmlow = TRUE, rmN = TRUE, minq = 20, numbq = 2),
                         align_bam="NA",
                         genome_index="NA",
                         map_bam="NA",
                         exon_anno="NA",
                         stnd=TRUE,
                         fix_chr=FALSE,
                         barcode_anno="NA",
                         max_mis=1,
                         UMI_cor=1,
                         gene_fl=FALSE,
                         organism,
                         gene_id_type) {
  fn = system.file("extdata", "report_template.Rmd", package = "scPipe")
  tx = readLines(fn)

  tx = gsub(pattern = "SAMPLE_NAME__", replacement = sample_name, x = tx)
  tx = gsub(pattern = "FQ1__", replacement = r1, x = tx)
  if (!is.null(r2)) {
    tx = gsub(pattern = "FQ2__", replacement = r2, x = tx)
  }
  else {
    tx = gsub(pattern = "FQ2__", replacement = "NA", x = tx)
  }

  tx = gsub(pattern = "FQOUT__", replacement = outfq, x = tx)
  if (read_structure$bs1<0) {
    tx = gsub(pattern = "BC1_INFO__", replacement = "NA", x = tx)
  }
  else {
    tx = gsub(pattern = "BC1_INFO__", replacement =
                paste0("start at position ", read_structure$bs1, ", length ", read_structure$bl1), x = tx)
  }

  tx = gsub(pattern = "BC2_INFO__", replacement =
              paste0("start at position ", read_structure$bs2, ", length ", read_structure$bl2), x = tx)
  tx = gsub(pattern = "UMI_INFO__", replacement =
              paste0("start at position ", read_structure$us, ", length ", read_structure$ul), x = tx)

  tx = gsub(pattern = "RM_N__", replacement = as.character(filter_settings$rmN), x = tx)
  tx = gsub(pattern = "RM_LOW__", replacement = as.character(filter_settings$rmlow), x = tx)
  tx = gsub(pattern = "MIN_Q__", replacement = filter_settings$minq, x = tx)
  tx = gsub(pattern = "NUM_BQ__", replacement = filter_settings$numbq, x = tx)

  tx = gsub(pattern = "BAM_ALIGN__", replacement = align_bam, x = tx)
  tx = gsub(pattern = "G_INDEX__", replacement = genome_index, x = tx)
  tx = gsub(pattern = "BAM_MAP__", replacement = map_bam, x = tx)

  tx = gsub(pattern = "OUTDIR__", replacement = outdir, x = tx)
  tx = gsub(pattern = "ANNO_GFF__", replacement = paste(exon_anno, collapse=", "), x = tx)

  tx = gsub(pattern = "STND__", replacement = as.character(stnd), x = tx)
  tx = gsub(pattern = "FIX_CHR__", replacement = as.character(fix_chr), x = tx)
  tx = gsub(pattern = "BC_ANNO__", replacement = barcode_anno, x = tx)
  tx = gsub(pattern = "MAX_MIS__", replacement = max_mis, x = tx)

  if (UMI_cor == 1) {
    tx = gsub(pattern = "UMI_COR__", replacement = "simple correction and merge UMI with distance 1", x = tx)
  }
  else if (UMI_cor == 0) {
    tx = gsub(pattern = "UMI_COR__", replacement = "no correction", x = tx)
  }
  else {
    tx = gsub(pattern = "UMI_COR__", replacement = "unknown", x = tx)
  }

  # If organism and gene id type are not provided, delete them from param list
  # of rmd. param$organism and param$gene_id_type will then return NULL when
  # when used in code.
  tx = gsub(pattern = "GENE_FL__", replacement = as.character(gene_fl), x = tx)
  if(!missing(organism) && !is.null(organism)){
      tx = gsub(pattern = "ORGANISM__", replacement = organism, x = tx)
  }else{
    tx = tx[!grepl(pattern = "ORGANISM__", x = tx)]
  }

  if(!missing(gene_id_type) && !is.null(gene_id_type)){
      tx = gsub(pattern = "GENE_ID_TYPE__", replacement = gene_id_type, x = tx)
  }else{
    tx = tx[!grepl(pattern = "GENE_ID_TYPE__", x = tx)]
  }

  writeLines(tx, con=file.path(outdir, "report.Rmd"))
  knitr::wrap_rmd(file.path(outdir, "report.Rmd"), width = 120, backup = NULL)
  rmarkdown::render(file.path(outdir, "report.Rmd"), output_file = file.path(outdir, "report.html"), knit_root_dir = ".")
}

#' create_processed_report
#'
#' Create an HTML report summarising pro-processed data. This is an alternative to the more verbose \code{create_report} that requires only the processed counts and stats folders.
#' @param outdir output folder.
#' @param organism the organism of the data. List of possible names can be retrieved using the function
#' `listDatasets`from `biomaRt` package. (e.g. `mmusculus_gene_ensembl` or `hsapiens_gene_ensembl`).
#' @param gene_id_type gene id type of the data A possible list of ids can be retrieved using the function `listAttributes` from `biomaRt` package.
#' the commonly used id types are `external_gene_name`, `ensembl_gene_id` or `entrezgene`.
#' @param report_name the name of the report .Rmd and .html files.
#'
#' @examples
#' \dontrun{
#' create_report(
#'        outdir="output_dir_of_scPipe",
#'        organism="mmusculus_gene_ensembl",
#'        gene_id_type="ensembl_gene_id")
#' }
#'
#' @export

create_processed_report <- function(
  outdir = ".",
  organism,
  gene_id_type,
  report_name = "report"
) {
  fn <-  system.file("extdata", "report_template_slim.Rmd", package = "scPipe")
  tx <- readLines(fn)
  fill_report_field <- function(field, value) {

    pattern <- paste0(field, "__")

    if (is.na(value)) {
      tx <- tx[-grep(pattern, tx)]
    } else {
      gsub(pattern, value, tx)
    }
  }

  if (!missing(organism) && !is.null(organism)) {
    tx <- fill_report_field("ORGANISM", organism)
  } else {
    tx <- fill_report_field("ORGANISM", NA)
  }

  if (!missing(gene_id_type) && !is.null(gene_id_type)) {
    tx <- fill_report_field("GENE_ID_TYPE", gene_id_type)
  } else {
    tx <- fill_report_field("GENE_ID_TYPE", NA)
  }

  report_path <- file.path(outdir, paste0(report_name, ".Rmd"))
  tx <- tx[!is.na(tx)]
  writeLines(tx, con = report_path)
  rmarkdown::render(
      input = report_path,
      envir = new.env(),
      knit_root_dir = "."
  )
}