R/wrapper_scPipeCPP.R

Defines functions sc_count_aligned_bam sc_demultiplex_and_count sc_detect_bc sc_gene_counting sc_correct_bam_bc sc_demultiplex sc_exon_mapping sc_trim_barcode

Documented in sc_correct_bam_bc sc_count_aligned_bam sc_demultiplex sc_demultiplex_and_count sc_detect_bc sc_exon_mapping sc_gene_counting sc_trim_barcode

#' sc_trim_barcode
#'
#' @description Reformat fastq files so barcode and UMI sequences are moved from
#'   the sequence into the read name.
#'
#'@details Positions used in this function are 0-indexed, so they start from 0
#'  rather than 1. The default read structure in this function represents
#'  CEL-seq paired-ended reads. This contains a transcript in the first read, a
#'  UMI in the first 6bp of the second read followed by a 8bp barcode. So the
#'  read structure will be : \code{list(bs1=-1, bl1=0, bs2=6, bl2=8, us=0,
#'  ul=6)}. \code{bs1=-1, bl1=0} indicates negative start position and zero
#'  length for the barcode on read one, this is used to denote "no barcode" on
#'  read one. \code{bs2=6, bl2=8} indicates there is a barcode in read two that
#'  starts at the 7th base with length 8bp. \code{us=0, ul=6} indicates a UMI
#'  from first base of read two and the length in 6bp.
#'
#'  For a typical Drop-seq experiment the read structure will be
#'  \code{list(bs1=-1, bl1=0, bs2=0, bl2=12, us=12, ul=8)}, which means the read
#'  one only contains transcript, the first 12bp in read two are cell barcode, followed
#'  by a 8bp UMI.
#'
#' @name sc_trim_barcode
#' @param outfq the output fastq file, which reformat the barcode and UMI into
#'   the read name. Files ending in \code{.gz} will be automatically compressed.
#' @param r1 read one for pair-end reads. This read should contain
#'   the transcript.
#' @param r2 read two for pair-end reads, NULL if single read.
#'   (default: NULL)
#' @param read_structure a list containing the read structure configuration:
#'   \itemize{
#'     \item{bs1}: starting position of barcode in read one. -1 if no barcode in
#'       read one.
#'     \item{bl1}: length of barcode in read one, if there is no
#'       barcode in read one this number is used for trimming beginning of read
#'       one.
#'     \item{bs2}: starting position of barcode in read two
#'     \item{bl2}: length of barcode in read two
#'     \item{us}: starting position of UMI
#'     \item{ul}: length of UMI
#'   }
#' @param filter_settings A list contains read filter settings:\itemize{
#'  \item{rmlow} whether to remove the low quality reads.
#'  \item{rmN} whether to remove reads that contains N in UMI or cell barcode.
#'  \item{minq} the minimum base pair quality that we allowed
#'  \item{numbq} the maximum number of base pair that have quality
#'  below \code{numbq}
#'  }
#' @export
#' @return generates a trimmed fastq file named \code{outfq}
#'
#' @examples
#' data_dir="celseq2_demo"
#' \dontrun{
#' # for the complete workflow, refer to the vignettes
#' ...
#' sc_trim_barcode(file.path(data_dir, "combined.fastq"),
#'    file.path(data_dir, "simu_R1.fastq"),
#'    file.path(data_dir, "simu_R2.fastq"))
#' ...
#' }
sc_trim_barcode = function(outfq, r1, r2=NULL,
                           read_structure = list(
                             bs1=-1, bl1=0, bs2=6, bl2=8, us=0, ul=6),
                           filter_settings = list(
                             rmlow=TRUE, rmN=TRUE, minq=20, numbq=2)) {

  outdir <- regmatches(outfq, regexpr(".*/", outfq))
  if (outdir != character(0) && !dir.exists(outdir))
    dir.create(outdir, recursive = TRUE)

  if (filter_settings$rmlow) {
    i_rmlow = 1
  }
  else {
    i_rmlow = 0
  }
  if (filter_settings$rmN) {
    i_rmN = 1
  }
  else {
    i_rmN = 0
  }

  if (substr(outfq, nchar(outfq) - 2, nchar(outfq)) == ".gz") {
    write_gz = TRUE
  }
  else {
    write_gz = FALSE
  }

  if (!is.null(r2)) {
    if (!file.exists(r1)) {stop("read1 fastq file does not exists.")}
    if (!file.exists(r2)) {stop("read2 fastq file does not exists.")}

    # expand tilde to home path for downstream gzopen() call
    r1 = path.expand(r1)
    r2 = path.expand(r2)

    rcpp_sc_trim_barcode_paired(outfq, r1, r2,
                                read_structure$bs1,
                                read_structure$bl1,
                                read_structure$bs2,
                                read_structure$bl2,
                                read_structure$us,
                                read_structure$ul,
                                i_rmlow,
                                i_rmN,
                                filter_settings$minq,
                                filter_settings$numbq,
                                write_gz)
  }
  else {
    stop("not implemented.")
  }
}


#' sc_exon_mapping
#'
#' @description Map aligned reads to exon annotation.
#' The result will be written into optional fields in bam file with different
#' tags. Following this link for more information regarding to bam file format:
#' http://samtools.github.io/hts-specs
#'
#' The function can accept multiple bam file as input, if multiple bam file is
#' provided and the `bc_len` is zero, then the function will use the barcode in
#' the `barcode_vector` to insert into the `bc` bam tag. So the length of
#' `barcode_vector` and the length of `inbam` should be the same
#' If this is the case then the `max_mis` argument in `sc_demultiplex`
#' should be zero. If `be_len` is larger than zero, then the function will
#' still seek for barcode in fastq headers with given length. In this case
#' each bam file is not treated as from a single cell.
#'
#'
#' @name sc_exon_mapping
#' @param inbam input aligned bam file. can have multiple files as input
#' @param outbam output bam filename
#' @param annofn single string or vector of gff3 annotation filenames,
#'   data.frame in SAF format or GRanges object containing complete gene_id
#'   metadata column.
#' @param bam_tags list defining BAM tags where mapping information is
#'   stored.
#'   \itemize{
#'     \item "am": mapping status tag
#'     \item "ge": gene id
#'     \item "bc": cell barcode tag
#'     \item "mb": molecular barcode tag
#'   }
#' @param bc_len total barcode length
#' @param barcode_vector a list of barcode if each individual bam is a single
#' cell. (default: NULL). The barcode should be of the same length for each
#' cell.
#' @param UMI_len UMI length
#' @param stnd TRUE to perform strand specific mapping. (default: TRUE)
#' @param fix_chr TRUE to add `chr` to chromosome names, MT to chrM. (default: FALSE)
#' @param nthreads number of threads to use. (default: 1)
#'
#' @export
#' @return generates a bam file with exons assigned
#' @examples
#' data_dir="celseq2_demo"
#' ERCCanno_fn = system.file("extdata", "ERCC92_anno.gff3",
#'     package = "scPipe")
#' \dontrun{
#' # for the complete workflow, refer to the vignettes
#' ...
#' sc_exon_mapping(file.path(data_dir, "out.aln.bam"),
#'                 file.path(data_dir, "out.map.bam"),
#'                 ERCCanno_fn)
#' ...
#' }
#'
sc_exon_mapping = function(inbam, outbam, annofn,
                            bam_tags = list(am="YE", ge="GE", bc="BC", mb="OX"),
                            bc_len=8, barcode_vector="", UMI_len=6, stnd=TRUE, fix_chr=FALSE,
                            nthreads=1) {
  if (stnd) {
    i_stnd = 1
  }
  else {
    i_stnd = 0
  }
  if (fix_chr) {
    i_fix_chr = 1
  }
  else {
    i_fix_chr = 0
  }

  if (any(!file.exists(inbam))) {
    stop("At least one input bam file does not exist")
  } else {
    inbam = path.expand(inbam)
  }

  outbam = path.expand(outbam)

  # if (length(inbam) > 1) {
  #   stop("Only one bam file can be used as input")
  # }

  if (!is(annofn, "character") &&
      !is(annofn, "GRanges") &&
      !is(annofn, "data.frame")) {
    stop("'annofn' must be either character vector, GRanges, or data.frame object")
  }

  if (is(annofn, "character")) {
    if (any(!file.exists(annofn))) {
      stop("At least one genome annotation file does not exist")
    } else {
      annofn = path.expand(annofn)
    }
    rcpp_sc_exon_mapping_df_anno(inbam, outbam, anno_import(annofn), bam_tags$am, bam_tags$ge, bam_tags$bc, bam_tags$mb, bc_len,
                                 barcode_vector, UMI_len, stnd, fix_chr, nthreads)
  } else if (is(annofn, "GRanges")) {
    rcpp_sc_exon_mapping_df_anno(inbam, outbam, anno_to_saf(annofn), bam_tags$am, bam_tags$ge, bam_tags$bc, bam_tags$mb, bc_len,
                                 barcode_vector, UMI_len, stnd, fix_chr, nthreads)
  } else if (is(annofn, "data.frame")) {
    validate_saf(annofn)
    rcpp_sc_exon_mapping_df_anno(inbam, outbam, annofn, bam_tags$am, bam_tags$ge, bam_tags$bc, bam_tags$mb, bc_len,
                                  barcode_vector, UMI_len, stnd, fix_chr, nthreads)
  }
}


#' sc_demultiplex
#'
#' @description Process bam file by cell barcode,
#' output to outdir/count/[cell_id].csv.
#' the output contains information for all reads that can be mapped to exons.
#' including the gene id, UMI of that read and the distance to transcript end
#' position.
#'
#' @param inbam input bam file. This should be the output of
#' \code{sc_exon_mapping}
#' @param outdir output folder
#' @param bc_anno barcode annotation, first column is cell id, second column
#' is cell barcode sequence
#' @param max_mis maximum mismatch allowed in barcode. (default: 1)
#' @param bam_tags list defining BAM tags where mapping information is
#'   stored.
#'   \itemize{
#'     \item "am": mapping status tag
#'     \item "ge": gene id
#'     \item "bc": cell barcode tag
#'     \item "mb": molecular barcode tag
#'   }
#' @param mito mitochondrial chromosome name.
#' This should be consistant with the chromosome names in the bam file.
#' @param has_UMI whether the protocol contains UMI (default: TRUE)
#' @param nthreads number of threads to use. (default: 1)
#'
#' @export
#' @return no return
#' @examples
#' data_dir="celseq2_demo"
#' barcode_annotation_fn = system.file("extdata", "barcode_anno.csv",
#'     package = "scPipe")
#' \dontrun{
#' # refer to the vignettes for the complete workflow
#' ...
#' sc_demultiplex(file.path(data_dir, "out.map.bam"),
#'     data_dir,
#'     barcode_annotation_fn,has_UMI=FALSE)
#' ...
#' }
#'
sc_demultiplex = function(inbam, outdir, bc_anno,
                          max_mis=1,
                          bam_tags = list(am="YE", ge="GE", bc="BC", mb="OX"),
                          mito="MT",
                          has_UMI=TRUE,
                          nthreads = 1) {
  dir.create(file.path(outdir, "count"), showWarnings = FALSE)
  dir.create(file.path(outdir, "stat"), showWarnings = FALSE)

  if (!dir.exists(outdir))
    dir.create(outdir, recursive = TRUE)

  outdir = path.expand(outdir)

  if (!file.exists(inbam)) {
    stop("input bam file does not exists.")
  } else {
    inbam = path.expand(inbam)
  }
  if (!file.exists(bc_anno)) {
    stop("barcode annotation file does not exists.")
  } else {
    bc_anno = path.expand(bc_anno)
  }
  rcpp_sc_demultiplex(inbam, outdir, bc_anno, max_mis,
                      bam_tags$am, bam_tags$ge, bam_tags$bc, bam_tags$mb,
                      mito, has_UMI, nthreads)
}


#' sc_correct_bam_bc
#'
#' @description update the cell barcode tag in bam file with corrected barcode
#' output to a new bam file. the function will be useful for methods
#' that use the cell barcode information from bam file, such as `Demuxlet`
#'
#' @param inbam input bam file. This should be the output of
#' \code{sc_exon_mapping}
#' @param outbam output bam file with updated cell barcode
#' @param bc_anno barcode annotation, first column is cell id, second column
#' is cell barcode sequence
#' @param max_mis maximum mismatch allowed in barcode. (default: 1)
#' @param bam_tags list defining BAM tags where mapping information is
#'   stored.
#'   \itemize{
#'     \item "am": mapping status tag
#'     \item "ge": gene id
#'     \item "bc": cell barcode tag
#'     \item "mb": molecular barcode tag
#'   }
#' @param mito mitochondrial chromosome name.
#' This should be consistant with the chromosome names in the bam file.
#' @param nthreads number of threads to use. (default: 1)
#'
#' @export
#' @return no return
#' @examples
#' data_dir="celseq2_demo"
#' barcode_annotation_fn = system.file("extdata", "barcode_anno.csv",
#'     package = "scPipe")
#' \dontrun{
#' # refer to the vignettes for the complete workflow
#' ...
#' sc_correct_bam_bc(file.path(data_dir, "out.map.bam"),
#'     file.path(data_dir, "out.map.clean.bam"),
#'     barcode_annotation_fn)
#' ...
#' }
#'
sc_correct_bam_bc = function(inbam, outbam, bc_anno,
                          max_mis=1,
                          bam_tags = list(am="YE", ge="GE", bc="BC", mb="OX"),
                          mito="MT",
                          nthreads = 1) {

  if (!file.exists(inbam)) {
    stop("input bam file does not exists.")
  } else {
    inbam = path.expand(inbam)
  }

  outbam = path.expand(outbam)

  if (!file.exists(bc_anno)) {
    stop("barcode annotation file does not exists.")
  } else {
    bc_anno = path.expand(bc_anno)
  }
  rcpp_sc_clean_bam(inbam, outbam, bc_anno, max_mis,
                      bam_tags$am, bam_tags$ge, bam_tags$bc, bam_tags$mb,
                      mito, nthreads)
}


#' sc_gene_counting
#'
#' @description Generate gene counts matrix with UMI deduplication
#'
#' @param outdir output folder containing \code{sc_demultiplex} output
#' @param bc_anno barcode annotation comma-separated-values, first column is
#'   cell id, second column is cell barcode sequence
#' @param UMI_cor correct UMI sequencing error: 0 means no correction, 1 means
#'   simple correction and merge UMI with distance 1. 2 means merge on both UMI
#'   alignment position match.
#' @param gene_fl whether to remove low abundance genes. A gene is considered to
#'   have low abundance if only one copy of one UMI is associated with it.
#'
#' @export
#' @return no return
#' @examples
#' data_dir="celseq2_demo"
#' barcode_annotation_fn = system.file("extdata", "barcode_anno.csv",
#' package = "scPipe")
#' \dontrun{
#' # refer to the vignettes for the complete workflow
#' ...
#' sc_gene_counting(data_dir, barcode_annotation_fn)
#' ...
#' }
#'
sc_gene_counting = function(outdir, bc_anno, UMI_cor=2, gene_fl=FALSE) {
  if (!dir.exists(outdir))
    dir.create(outdir, recursive = TRUE)

  outdir = path.expand(outdir)

  dir.create(file.path(outdir, "count"), showWarnings = FALSE)
  dir.create(file.path(outdir, "stat"), showWarnings = FALSE)

  if (gene_fl) {
    i_gene_fl = 1
  } else {
    i_gene_fl = 0
  }

  if (!file.exists(bc_anno)) {
    stop("barcode annotation file does not exists.")
  } else {
    bc_anno = path.expand(bc_anno)
  }

  rcpp_sc_gene_counting(outdir, bc_anno, UMI_cor, i_gene_fl)
}



#' sc_detect_bc
#'
#' @description Detect cell barcode and generate the barcode annotation
#'
#' @param infq input fastq file, shoule be the output file of
#' \code{sc_trim_barcode}
#' @param outcsv output barcode annotation
#' @param prefix the prefix of cell name (default: `CELL_`)
#' @param bc_len the length of cell barcode, should be consistent with bl1+bl2
#' in \code{sc_trim_barcode}
#' @param max_reads the maximum of reads processed (default: 1,000,000)
#' @param min_count minimum counts to keep, barcode will be discarded if
#' it has lower count. Default value is 10. This should be set according
#' to \code{max_reads}.
#' @param number_of_cells number of cells kept in result. (default: 10000)
#' @param max_mismatch the maximum mismatch allowed. Barcodes within this
#' number will be considered as sequence error and merged. (default: 1)
#' @param white_list_file a file that list all the possible barcodes
#' each row is a barcode sequence. the list for 10x can be found at:
#' https://community.10xgenomics.com/t5/Data-Sharing/List-of-valid-cellular-barcodes/td-p/527
#' (default: NULL)
#' @export
#' @return no return
#' @examples
#' \dontrun{
#' # `sc_detect_bc`` should run before `sc_demultiplex` for
#' # Drop-seq or 10X protocols
#' sc_detect_bc("input.fastq","output.cell_index.csv",bc_len=8)
#' sc_demultiplex(...,"output.cell_index.csv")
#' }
#'
#'
sc_detect_bc = function(infq, outcsv, prefix="CELL_", bc_len,
                        max_reads=1000000, min_count=10, number_of_cells=10000,
                        max_mismatch=1,white_list_file=NULL) {
  if (!file.exists(infq)) {
    stop("input fastq file does not exists.")
  } else {
    infq = path.expand(infq)
  }
  outcsv = path.expand(outcsv)
  if(!is.null(white_list_file)){
    if(!file.exists(white_list_file)){
      stop("input whitelist file does not exists.")
    }
  } else {
    white_list_file = ""
  }
  if (max_reads=="all") {m_r = -1}
  else {m_r=max_reads}
  rcpp_sc_detect_bc(infq, outcsv, prefix, bc_len, m_r, number_of_cells,
    min_count, max_mismatch, white_list_file)
}

#' sc_demultiplex_and_count
#'
#' @description Wrapper to run \code{\link{sc_demultiplex}} and
#'   \code{\link{sc_gene_counting}} with a single command
#'
#' @inheritParams sc_demultiplex
#' @inheritParams sc_gene_counting
#'
#' @return no return
#'
#' @examples
#' \dontrun{
#' refer to the vignettes for the complete workflow, replace demultiplex and
#' count with single command:
#' ...
#' sc_demultiplex_and_count(
#'    file.path(data_dir, "out.map.bam"),
#'    data_dir,
#'    barcode_annotation_fn,
#'    has_UMI = FALSE
#' )
#' ...
#' }
#'
#' @export
#'
sc_demultiplex_and_count = function(
  inbam, outdir, bc_anno,
  max_mis = 1,
  bam_tags = list(am="YE", ge="GE", bc="BC", mb="OX"),
  mito = "MT", has_UMI = TRUE, UMI_cor = 1, gene_fl = FALSE,
  nthreads = 1
) {
  sc_demultiplex(
    inbam = inbam,
    outdir = outdir,
    bc_anno = bc_anno,
    max_mis = max_mis,
    bam_tags = bam_tags,
    mito = mito,
    has_UMI = has_UMI,
    nthreads = nthreads
  )

  sc_gene_counting(
    outdir = outdir,
    bc_anno = bc_anno,
    UMI_cor = UMI_cor,
    gene_fl = gene_fl
  )
}

#' sc_count_aligned_bam
#'
#' @description Wrapper to run \code{\link{sc_exon_mapping}},
#'   \code{\link{sc_demultiplex}} and \code{\link{sc_gene_counting}} with a
#'   single command
#'
#' @inheritParams sc_exon_mapping
#' @inheritParams sc_demultiplex
#' @inheritParams sc_gene_counting
#' @param keep_mapped_bam TRUE if feature mapped bam file should be retained.
#'
#' @return no return
#'
#' @examples
#' \dontrun{
#' sc_count_aligned_bam(
#'   inbam = "aligned.bam",
#'   outbam = "mapped.bam",
#'   annofn = c("MusMusculus-GRCm38p4-UCSC.gff3", "ERCC92_anno.gff3"),
#'   outdir = "output",
#'   bc_anno = "barcodes.csv"
#' )
#' }
#'
#' @export
#'
sc_count_aligned_bam <- function(
  inbam, outbam, annofn,
  bam_tags = list(am="YE", ge="GE", bc="BC", mb="OX"),
  bc_len = 8, UMI_len = 6,
  stnd = TRUE, fix_chr = FALSE,
  outdir,
  bc_anno,
  max_mis = 1,
  mito = "MT",
  has_UMI = TRUE, UMI_cor = 1, gene_fl = FALSE,
  keep_mapped_bam = TRUE,
  nthreads = 1
) {
  sc_exon_mapping(
    inbam = inbam,
    outbam = outbam,
    annofn = annofn,
    bam_tags = bam_tags,
    bc_len = bc_len,
    UMI_len = UMI_len,
    stnd = stnd,
    fix_chr = fix_chr,
    nthreads = nthreads
  )

  sc_demultiplex_and_count(
    inbam = outbam,
    outdir = outdir,
    bc_anno = bc_anno,
    max_mis = max_mis,
    bam_tags = bam_tags,
    mito = mito,
    has_UMI = has_UMI,
    UMI_cor = UMI_cor,
    gene_fl = gene_fl,
    nthreads = nthreads
  )

  if (!keep_mapped_bam) {
    unlink(outbam)
  }
}

Try the scPipe package in your browser

Any scripts or data that you put into this service are public.

scPipe documentation built on Nov. 8, 2020, 8:28 p.m.