R/app-cellRanger.R

Defines functions getCellRangerReference getCellRangerVDJReference getCellRangerGEXReference computeBamStatsSC createLibraryFile subsample getFastqDirs ezMethodCellRanger

Documented in getCellRangerGEXReference getCellRangerVDJReference

###################################################################
# Functional Genomics Center Zurich
# This code is distributed under the terms of the GNU General
# Public License Version 3, June 2007.
# The terms are available here: http://www.gnu.org/licenses/gpl.html
# www.fgcz.ch

ezMethodCellRanger <- function(input = NA, output = NA, param = NA) {
  sampleName <- input$getNames()
  sampleDirs <- sort(getFastqDirs(input, "RawDataDir",sampleName))
 
  #1. extract tar files if they are in tar format
  if (all(grepl("\\.tar$", sampleDirs))) {
    runDirs <- tarExtract(sampleDirs, prependUnique=TRUE)
  } else {
    stop("Require inputs to be provided in .tar files.")
  }
  
  #1.1 check validity of inputs
  runDirs <- normalizePath(runDirs)
  
  fileLevelDirs <- normalizePath(list.files(path=runDirs, full.names=TRUE))
  if (any(fs::is_file(fileLevelDirs))) {
    stop(sprintf("Fastq files need to nested inside a folder sharing the samplename. Offending samples: %s", 
                 paste(fileLevelDirs[fs::is_file(fileLevelDirs)], collapse=", ")))
  }
  
  #2. Subsample if chosen
  if (ezIsSpecified(param$nReads) && param$nReads > 0)
    fileLevelDirs <- sapply(fileLevelDirs, subsample, param)
  
  #2.1 Fix FileNames if sampleName in dataset was changed
  cwd <- getwd()
  if(any(basename(fileLevelDirs) != sampleName)) {
    for (fileLevelDir in fileLevelDirs) {
      setwd(fileLevelDir)
      cmd <- paste('rename', 
                   paste0('s/', basename(fileLevelDir),'/',sampleName, '/g'), 
                   paste0(basename(fileLevelDir),'*.gz'))
      ezSystem(cmd)
    }
    setwd(cwd)
  }
  
  fileLevelDir <- paste(fileLevelDirs, collapse = ",")
  cellRangerFolder <- str_sub(sampleName, 1, 45) %>% str_c("-cellRanger")
  
  #3.Generate the cellranger command with the required arguments
  switch(param$TenXLibrary,
         GEX = {
           #3.1. Obtain GEX the reference
           refDir <- getCellRangerGEXReference(param)
           #3.2. Command
           cmd <- paste(
             "cellranger count", paste0("--id=", cellRangerFolder),
             paste0("--transcriptome=", refDir),
             paste0("--fastqs=", fileLevelDir),
             paste0("--sample=", sampleName),
             paste0("--localmem=", param$ram),
             paste0("--localcores=", param$cores),
             paste0("--chemistry=", param$chemistry),
             if(grepl('^8', basename(param$CellRangerVersion))){paste0("--create-bam true")},
             if (ezIsSpecified(param$expectedCells)) {paste0("--expect-cells=", param$expectedCells)},
             ifelse(ezIsSpecified(param$includeIntrons) && param$includeIntrons, "--include-introns=true", "--include-introns=false")
           )
         },
         VDJ = {
           #3.1. Obtain the VDJ reference
           refDir <- getCellRangerVDJReference(param)
           #3.2. Command
           cmd <- paste(
             "cellranger vdj", paste0("--id=", cellRangerFolder),
             paste0("--reference=", refDir),
             paste0("--fastqs=", fileLevelDir),
             paste0("--sample=", sampleName),
             paste0("--localmem=", param$ram),
             paste0("--localcores=", param$cores)
           )
         },
         FeatureBarcoding = {
           #3.1. Obtain GEX the reference
           refDir <- getCellRangerGEXReference(param)
           
           #3.2. Locate the Feature sample
           featureDirs <- getFastqDirs(input, "FeatureDataDir", sampleName)
           featureName <- gsub(".tar", "", basename(featureDirs))
           
           #3.3. Locate the Feature info csv file
           featureRefFn <- file.path(
             dirname(featureDirs),
             str_c(sampleName, "feature_ref.csv", sep = "_")
           )
           stopifnot(any(file.exists(featureRefFn)))
           featureRefFn <- head(featureRefFn[file.exists(featureRefFn)], 1)
           
           #3.4. Decompress the sample that contains the antibodies reads if they are in tar format
           if (all(grepl("\\.tar$", featureDirs)))
             featureDirs <- tarExtract(featureDirs)
           
           featureDirs <- normalizePath(featureDirs)
           
           #3.5. Create library file that contains the sample and feature dirs location
           libraryFn <- createLibraryFile(fileLevelDirs, featureDirs, sampleName, featureName)
           
           #3.6. Command
           cmd <- paste(
             "cellranger count", paste0("--id=", cellRangerFolder),
             paste0("--transcriptome=", refDir),
             paste0("--libraries=", libraryFn),
             paste0("--feature-ref=", featureRefFn),
             paste0("--localmem=", param$ram),
             paste0("--localcores=", param$cores),
             paste0("--chemistry=", param$chemistry),
             if (ezIsSpecified(param$expectedCells)) {paste0("--expect-cells=", param$expectedCells)},
             ifelse(ezIsSpecified(param$includeIntrons) && param$includeIntrons, "--include-introns=true", "--include-introns=false")
           )
         })
  
  #4. Add additional cellranger options if specified
  if (ezIsSpecified(param$cmdOptions)) {
    cmd <- paste(cmd, param$cmdOptions)
  }
  
  #5. Execute the command
  ezSystem(cmd)
  
  #6. Optional run of VeloCyto
  if(param$runVeloCyto){
      gtfFile <- param$ezRef["refFeatureFile"]
      cmd <- paste('velocyto run10x', cellRangerFolder, gtfFile, '-@', param$cores)
      ezSystem(cmd)
      ezSystem(paste('mv', file.path(cellRangerFolder,'velocyto'),  file.path(cellRangerFolder,'outs')))
  }
  
  #7. Delete temp files and rename the final cellranger output folder
  unlink(dirname(runDirs), recursive = TRUE)
  if (exists("featureDirs")){
    unlink(basename(featureDirs))
  }
  file.rename(file.path(cellRangerFolder, "outs"), sampleName)
  unlink(cellRangerFolder, recursive = TRUE)
  if (ezIsSpecified(param$controlSeqs)) 
    unlink(refDir, recursive = TRUE)
  
  #8. Calculate alignment stats from the BAM file for GEX
  if(param$bamStats && param$TenXLibrary == "GEX"){
    genomeBam <- file.path(sampleName, "possorted_genome_bam.bam")
    if (file.exists(genomeBam)){
      alignStats <- computeBamStatsSC(genomeBam, ram=param$ram)
      if (!is.null(alignStats)){
        ezWrite.table(alignStats, file=file.path(sampleName, "CellAlignStats.txt"), head="Barcode")
      }
    }
  }

  # for GEX libraries
  if(!param$keepBam && param$TenXLibrary == "GEX"){
      bamFile <- file.path(sampleName, "possorted_genome_bam.bam")
      if(file.exists(bamFile)){
        ezSystem(paste('rm', bamFile))
        ezSystem(paste('rm', paste0(bamFile,'.bai')))
      }
  }
  
  return("Success")
}

getFastqDirs <- function(input, column, sampleName) {
  fastqDirs <- strsplit(input$getColumn(column), ",")[[sampleName]]
  fastqDirs <- file.path(input$dataRoot, fastqDirs)
  return(fastqDirs)
}

subsample <- function(targetDir, param){
  subDir = paste0(targetDir, "-sub")
  dir.create(subDir)
  fqFiles = list.files(targetDir, pattern = ".fastq.gz", full.names = TRUE, recursive = TRUE)
  stopifnot(length(fqFiles) <= 4) ## subsample commands below do only work if reads are not split in per-lane files
  for (fq in fqFiles){
    fqSub = file.path(subDir, basename(fq))
    cmd = paste("seqtk sample -s 42 -2", fq, param$nReads, "| pigz --fast -p1 >", fqSub)
    ezSystem(cmd)
  }
  return(subDir)
}

createLibraryFile <- function(sampleDirs, featureDirs, sampleName, featureName) {
  libraryFn <- tempfile(pattern = "library", tmpdir = ".", fileext = ".csv")
  libraryTb <- tibble(
    fastqs = c(sampleDirs, featureDirs),
    sample = c(
      rep(sampleName, length(sampleDirs)),
      featureName
    ),
    library_type = c(
      rep("Gene Expression", length(sampleDirs)),
      rep("Antibody Capture", length(featureDirs))
    )
  )
  write_csv(libraryTb, libraryFn)
  return(libraryFn)
}

computeBamStatsSC = function(bamFile, ram=NULL) {
  ## compute stats per cell from the bam file
  if (!is.null(ram)){  
    nAlign = sum(ezScanBam(bamFile, tag = "CB", 
                           what = character(0), isUnmappedQuery = FALSE, countOnly = TRUE)$records)
    if (nAlign / ram > 20e6){
      message("computeBamStatsSC: not executed - would take too much RAM")
      return(NULL)
    }
  }
  cb = ezScanBam(bamFile, tag = "CB", 
                 what = character(0), isUnmappedQuery = FALSE)$tag$CB
  nReads = table(cb)
  resultFrame = data.frame(nRead=as.vector(nReads), row.names=names(nReads))
  x = ezScanBam(bamFile, tag = "UB", 
                what = character(0), isUnmappedQuery = FALSE)$tag$UB
  resultFrame$nUmi = as.vector(tapply(x, cb, n_distinct))
  x = ezScanBam(bamFile, tag = "ts", 
                what = character(0), isUnmappedQuery = FALSE)$tag$ts
  if (length(x) == length(cb)){ ## the 5' protocol does not have the ts tag
    resultFrame$nTso = as.vector(tapply(x > 3, cb, sum, na.rm=TRUE)) ## at least 3 bases
  }
  x = ezScanBam(bamFile, tag = "pa", 
                what = character(0), isUnmappedQuery = FALSE)$tag$pa
  if (length(x) == length(cb)){ ## the 5' protocol does not have the ts tag
    resultFrame$nPa = as.vector(tapply(x > 3, cb, sum, na.rm=TRUE))
  }
  x = ezScanBam(bamFile, tag = "RE", 
                what = character(0), isUnmappedQuery = FALSE)$tag$RE
  resultFrame$nIntergenic = as.vector(tapply(x == "I", cb, sum))
  resultFrame$nExonic = as.vector(tapply(x == "E", cb, sum))
  resultFrame$nIntronic = as.vector(tapply(x == "N", cb, sum))
  return(resultFrame)
}


getCellRangerGEXReference <- function(param) {
  require(rtracklayer)
  cwd <- getwd()
  on.exit(setwd(cwd), add = TRUE)
  
  if (ezIsSpecified(param$controlSeqs) | ezIsSpecified(param$secondRef)) {
    refDir <- file.path(getwd(), "10X_customised_Ref")
  } else {
    if (ezIsSpecified(param$transcriptTypes)) {
      cellRangerBase <- paste(sort(param$transcriptTypes), collapse = "-")
      ## This is a combination of transcript types to use.
    } else {
      cellRangerBase <- ""
    }
    refDir <- sub(
      "\\.gtf$", paste0("_10XGEX_SC_", cellRangerBase, "_Index"),
      param$ezRef["refFeatureFile"]
    )
  }
  
  lockFile <- paste0(refDir, ".lock")
  i <- 0
  while (file.exists(lockFile) && i < INDEX_BUILD_TIMEOUT) {
    ### somebody else builds and we wait
    Sys.sleep(60)
    i <- i + 1
  }
  if (file.exists(lockFile)) {
    stop(paste(
      "reference building still in progress after",
      INDEX_BUILD_TIMEOUT, "min"
    ))
  }
  ## there is no lock file
  if (file.exists(refDir)) {
    ## we assume the index is built and complete
    return(refDir)
  }
  
  ## we have to build the reference
  setwd(dirname(refDir))
  ezWrite(Sys.info(), con = lockFile)
  on.exit(file.remove(lockFile), add = TRUE)
  
  job <- ezJobStart("10X CellRanger build")
  
  if (ezIsSpecified(param$controlSeqs)) {
    ## make reference genome
    genomeLocalFn <- tempfile(
      pattern = "genome", tmpdir = getwd(),
      fileext = ".fa"
    )
    file.copy(from = param$ezRef@refFastaFile, to = genomeLocalFn)
    writeXStringSet(getControlSeqs(param$controlSeqs),
                    filepath = genomeLocalFn,
                    append = TRUE
    )
    on.exit(file.remove(genomeLocalFn), add = TRUE)
  } else if(ezIsSpecified(param$secondRef)){
      ## make reference genome
      genomeLocalFn <- tempfile(
          pattern = "genome", tmpdir = getwd(),
          fileext = ".fa"
      )
      file.copy(from = param$ezRef@refFastaFile, to = genomeLocalFn)
      secondaryRef  <- readDNAStringSet(param$secondRef)
      writeXStringSet(secondaryRef,
                      filepath = genomeLocalFn,
                      append = TRUE
      )
      on.exit(file.remove(genomeLocalFn), add = TRUE) 
  } else {
    genomeLocalFn <- param$ezRef@refFastaFile
  }
  
  ## make gtf
  gtfFile <- tempfile(
    pattern = "genes", tmpdir = getwd(),
    fileext = ".gtf"
  )
  if (ezIsSpecified(param$transcriptTypes)) {
    export.gff2(gtfByTxTypes(param, param$transcriptTypes),
                con = gtfFile
    )
  } else {
    file.copy(from = param$ezRef@refFeatureFile, to = gtfFile)
  }
  if (ezIsSpecified(param$controlSeqs)|ezIsSpecified(param$secondRef)) {
    extraGR <- makeExtraControlSeqGR(param)
    gtfExtraFn <- tempfile(
      pattern = "extraSeqs", tmpdir = getwd(),
      fileext = ".gtf"
    )
    on.exit(file.remove(gtfExtraFn), add = TRUE)
    export.gff2(extraGR, con = gtfExtraFn)
    ezSystem(paste("cat", gtfExtraFn, ">>", gtfFile))
  }
  
  cmd <- paste(
    "cellranger mkref",
    paste0("--genome=", basename(refDir)),
    paste0("--fasta=", genomeLocalFn),
    paste0("--genes=", gtfFile),
    paste0("--nthreads=", param$cores)
  )
  ezSystem(cmd)
  
  file.remove(gtfFile)
  
  return(refDir)
}

getCellRangerVDJReference <- function(param) {
  require(rtracklayer)
  cwd <- getwd()
  on.exit(setwd(cwd), add = TRUE)
  
  refDir <- sub(
    "\\.gtf$", "_10XVDJ_Index",
    param$ezRef["refFeatureFile"]
  )
  
  lockFile <- paste0(refDir, ".lock")
  i <- 0
  while (file.exists(lockFile) && i < INDEX_BUILD_TIMEOUT) {
    ### somebody else builds and we wait
    Sys.sleep(60)
    i <- i + 1
  }
  if (file.exists(lockFile)) {
    stop(paste(
      "reference building still in progress after",
      INDEX_BUILD_TIMEOUT, "min"
    ))
  }
  ## there is no lock file
  if (file.exists(refDir)) {
    ## we assume the index is built and complete
    return(refDir)
  }
  
  ## we have to build the reference
  setwd(dirname(refDir))
  ezWrite(Sys.info(), con = lockFile)
  on.exit(file.remove(lockFile), add = TRUE)
  
  job <- ezJobStart("10X CellRanger build")
  
  cmd <- paste(
    "cellranger mkvdjref",
    paste0("--genome=", basename(refDir)),
    paste0("--fasta=", param$ezRef@refFastaFile),
    paste0("--genes=", param$ezRef@refFeatureFile)
  )
  ezSystem(cmd)
  
  return(refDir)
}

## not used any more
getCellRangerReference <- function(param) {
  if (ezIsSpecified(param$controlSeqs)) {
    if (param$TenXLibrary == "VDJ") {
      stop("VDJ library with extra control sequences is not implemented yet!")
    }
    require(rtracklayer)
    ## make reference genome
    genomeLocalFn <- tempfile(
      pattern = "genome", tmpdir = getwd(),
      fileext = ".fa"
    )
    file.copy(from = param$ezRef@refFastaFile, to = genomeLocalFn)
    writeXStringSet(getControlSeqs(param$controlSeqs),
                    filepath = genomeLocalFn,
                    append = TRUE
    )
    on.exit(file.remove(genomeLocalFn), add = TRUE)
    
    ## make gtf
    gtfFile <- tempfile(
      pattern = "genes", tmpdir = getwd(),
      fileext = ".gtf"
    )
    on.exit(file.remove(gtfFile), add = TRUE)
    
    file.copy(from = param$ezRef@refFeatureFile, to = gtfFile)
    extraGR <- makeExtraControlSeqGR(param$controlSeqs)
    gtfExtraFn <- tempfile(
      pattern = "extraSeqs", tmpdir = getwd(),
      fileext = ".gtf"
    )
    on.exit(file.remove(gtfExtraFn), add = TRUE)
    export.gff2(extraGR, con = gtfExtraFn)
    ezSystem(paste("cat", gtfExtraFn, ">>", gtfFile))
    
    ## build the index
    refDir <- file.path(getwd(), "10X_customised_Ref")
    cmd <- paste(
      "cellranger mkref",
      paste0("--genome=", basename(refDir)),
      paste0("--fasta=", genomeLocalFn),
      paste0("--genes=", gtfFile),
      paste0("--nthreads=", param$cores)
    )
    ezSystem(cmd)
  } else {
    ## TODO: automate the reference building
    refDir <- dirname(param$ezRef["refFeatureFile"])
    if (param$TenXLibrary == "VDJ") {
      refDirs <- list.files(
        path = refDir, pattern = "^10X_Ref.*_VDJ_",
        full.names = TRUE
      )
    } else if (param$TenXLibrary == "GEX") {
      refDirs <- list.files(
        path = refDir, pattern = "^10X_Ref.*_GEX_",
        full.names = TRUE
      )
    } else {
      stop("Unsupported 10X library: ", param$TenXLibrary)
    }
    
    if (length(refDirs) == 0) {
      stop("No 10X_Ref folder found in", refDir)
    }
    if (length(refDirs) > 1) {
      warning("Multiple 10X_Ref folders in ", refDir)
    }
    refDir <- refDirs[1]
  }
  return(refDir)
}

##' @author Opitz, Lennart
##' @template app-template
##' @templateVar method ezMethodCellRanger(input=NA, output=NA, param=NA)
##' @description Use this reference class to run
EzAppCellRanger <-
  setRefClass("EzAppCellRanger",
              contains = "EzApp",
              methods = list(
                initialize = function() {
                  "Initializes the application using its specific defaults."
                  runMethod <<- ezMethodCellRanger
                  name <<- "EzAppCellRanger"
                  appDefaults <<- rbind(
                    TenXLibrary = ezFrame(
                      Type = "charVector",
                      DefaultValue = "GEX",
                      Description = "Which 10X library? GEX or VDJ."
                    ),
                    chemistry = ezFrame(
                      Type = "character",
                      DefaultValue = "auto",
                      Description = "Assay configuration."
                    ),
                    expectedCells = ezFrame(
                      Type = "numeric",
                      DefaultValue = 10000,
                      Description = "Expected number of cells."
                    ),
                    includeIntrons = ezFrame(
                      Type = "logical",
                      DefaultValue = TRUE,
                      Description = "Count reads on introns."
                    ),
                    controlSeqs = ezFrame(
                      Type = "charVector",
                      DefaultValue = "",
                      Description = "control sequences to add"
                    ),
                    bamStats = ezFrame(
                      Type = "logical",
                      DefaultValue = TRUE,
                      Description = "compute per cell alignment stats"
                    ),
                    runVeloCyto = ezFrame(
                        Type = "logical",
                        DefaultValue = FALSE,
                        Description = "run velocyto and generate loom file"
                    ),
                    keepBam = ezFrame(
                        Type = "logical",
                        DefaultValue = FALSE,
                        Description = "keep bam file produced by CellRanger"
                    )
                  )
                }
              )
  )
uzh/ezRun documentation built on April 24, 2024, 4:01 p.m.