inst/doc/CaseStudy-RNAseqQuantification.R

## ----experimentPrep, eval = FALSE---------------------------------------------
#  library(BiocParallel)
#  dir.create("fastq", showWarnings = FALSE)
#  
#  extractSRA <- function(sra_accession, exe_path = 'fastq-dump',
#                         args = '--split-3 --gzip', outdir = 'fastq',
#                         dry_run = FALSE) {
#      cmdline = sprintf('%s %s --outdir %s %s',
#                        exe_path, args, outdir, sra_accession)
#      if (dry_run) {
#          message("will run with this command line:\n", cmdline)
#      } else {
#          return(system(cmdline))
#      }
#  }
#  
#  samples <- c("SRR5273705", "SRR5273689", "SRR5273699", "SRR5273683")
#  
#  bplapply(samples, extractSRA, BPPARAM = MulticoreParam(4))
#  
#  annotation <- data.frame(samples,
#                           tissue = c("brain", "brain", "heart", "heart"))

## ----downloadReference, eval = FALSE------------------------------------------
#  dir.create("reference/raw", recursive = TRUE, showWarnings = FALSE)
#  download.file("ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M16/gencode.vM16.transcripts.fa.gz",
#                destfile = "reference/raw/transcriptome.fa.gz")

## ----indexBuild, eval = FALSE-------------------------------------------------
#  dir.create("reference/index", showWarnings = FALSE)
#  
#  system("kallisto index -i reference/index/kallistoIdx.idx reference/raw/transcriptome.fa.gz")
#  system("salmon index -t reference/raw/transcriptome.fa.gz -i reference/index/salmon_index")
#  system("gunzip -c reference/raw/transcriptome.fa.gz > reference/raw/transcriptome.fa && sailfish index -t reference/raw/transcriptome.fa -o reference/index/sailfish_index")
#  
#  library(Biostrings)
#  
#  dnSt <- names(readDNAStringSet("reference/raw/transcriptome.fa.gz"))
#  dnSt <- sapply(strsplit(dnSt, "\\||\\."), "[[", 1)

## ----versions, eval = FALSE---------------------------------------------------
#  library(SummarizedBenchmark)
#  
#  getKallistoVersion <- function() {
#      vers <-
#          suppressWarnings(system("kallisto", intern = TRUE)[1])
#      strsplit(vers, " ")[[1]][2]
#  }
#  
#  getSalmonVersion <- function() {
#      vers <-
#          suppressWarnings(system("salmon --version 2>&1", intern = TRUE)[1])
#      strsplit(vers, " ")[[1]][2]
#  }
#  
#  getSailfishVersion <- function() {
#      vers <-
#          suppressWarnings(system("sailfish --version 2>&1", intern = TRUE)[1])
#      strsplit(vers, " ")[[1]][3]
#  }

## ---- eval = FALSE------------------------------------------------------------
#  dir.create("out/kallisto", showWarnings = FALSE)
#  dir.create("out/salmon", showWarnings = FALSE)
#  dir.create("out/sailfish", showWarnings = FALSE)
#  
#  runKallisto <- function(sample, args = "") {
#      fastqFile1 <- sprintf( "fastq/%s_1.fastq.gz", sample)
#      fastqFile2 <- gsub( "_1", "_2", fastqFile1)
#      output <- sprintf("out/kallisto/%s.out", sample)
#      cmd <- sprintf("kallisto quant -i reference/index/kallistoIdx.idx -o %s %s %s %s",
#                     output, args, fastqFile1, fastqFile2)
#      system(cmd)
#      require(tximport)
#      ab <- tximport(file.path(output, "abundance.h5"),
#                   type = "kallisto", txOut = TRUE)
#      counts <- ab$counts[, 1]
#      names(counts) <- sapply(strsplit(names(counts), "\\||\\."), "[[", 1)
#      counts
#  }
#  
#  runSalmon <- function(sample, args = "-l A -p 4") {
#      fastqFile1 <- sprintf("fastq/%s_1.fastq.gz", sample)
#      fastqFile2 <- gsub( "_1", "_2", fastqFile1)
#      output <- sprintf("out/salmon/%s.out", sample)
#      cmd <- sprintf("salmon quant -i reference/index/salmon_index %s -o %s -1 %s -2 %s",
#                     args, output, fastqFile1, fastqFile2)
#      system(cmd)
#      require(tximport)
#      counts <- tximport(file.path(output, "quant.sf"),
#                         type = "salmon", txOut = TRUE)$counts[, 1]
#      names(counts) <- sapply(strsplit(names(counts), "\\||\\."), "[[", 1)
#      counts
#  }
#  
#  runSailfish <- function(sample, args = "-l IU") {
#      fastqFile1 <- sprintf( "fastq/%s_1.fastq.gz", sample)
#      fastqFile2 <- gsub( "_1", "_2", fastqFile1)
#      output <- sprintf("out/sailfish/%s.out", sample)
#      cmd <- sprintf("echo \"sailfish quant -i reference/index/sailfish_index %s -o %s -1 <(zcat %s) -2 <(zcat %s)\" | bash",
#                     args, output, fastqFile1, fastqFile2)
#      cat(cmd)
#      system(cmd)
#      counts <- tximport(file.path(output, "quant.sf"),
#                         type = "sailfish", txOut = TRUE)$counts[, 1]
#      names(counts) <- sapply(strsplit(names(counts), "\\||\\."), "[[", 1)
#      counts
#  }

## ---- eval=FALSE--------------------------------------------------------------
#  library(SummarizedBenchmark)
#  library(tximport)
#  
#  b <- BenchDesign() %>%
#      addMethod(label = "kallisto-default",
#                func = runKallisto,
#                params = rlang::quos(sample = sample,
#                                     args = "-t 16"),
#                meta = list(pkg_vers = rlang::quo(getKallistoVersion()))
#                ) %>%
#      addMethod(label = "kallisto-bias",
#               func = runKallisto,
#               params = rlang::quos(sample = sample,
#                                    args = "--bias -t 16"),
#               meta = list(pkg_vers = rlang::quo(getKallistoVersion()))
#               ) %>%
#     addMethod(label = "salmon-default",
#               func = runSalmon,
#               params = rlang::quos(sample = sample,
#                                    args = "-l IU -p 16"),
#               meta = list(pkg_vers = rlang::quo(getSalmonVersion()))
#               ) %>%
#     addMethod(label = "salmon-gcBias",
#               func = runSalmon,
#               params = rlang::quos(sample=sample,
#                                    args="-l IU --gcBias -p 16"),
#               meta = list(pkg_vers = rlang::quo(getSalmonVersion()))
#               ) %>%
#     addMethod(label = "sailfish-default",
#               func = runSailfish,
#               params = rlang::quos(sample=sample,
#                                    args="-l IU -p 16"),
#               meta = list(pkg_vers = rlang::quo(getSailfishVersion()))
#               )
#  
#  printMethods(b)

## ----runBenchmark, eval=FALSE-------------------------------------------------
#  dat <- list(txIDs = dnSt)
#  allSB <- lapply(samples, function(sample) {
#      dat[["sample"]] <- sample
#      sb <- buildBench(b, data = dat, sortIDs = "txIDs",
#                       catchErrors = FALSE, parallel = TRUE,
#                       BPPARAM = SerialParam())
#      colData( sb )$sample <- sample
#      colData( sb )$tissue <- as.character(annotation$tissue[annotation$sample == sample])
#      sb
#  })

## ----subsample, eval=FALSE----------------------------------------------------
#  keepRows <- sapply(allSB, function(x) { rowSums(is.na(assay(x))) })
#  keepRows <- rowSums(keepRows) == 0
#  allSB <- lapply(allSB, function(x) { x[keepRows, ] })
#  
#  set.seed(12)
#  keepRows <- sample(seq_len(nrow(allSB[[1]])), 50000)
#  
#  allSB <- lapply(allSB, function(x) { x[keepRows, ] })
#  
#  #save(allSB, file = "../data/quantSB.rda",
#  #     compress = "xz", compression_level = 9)

## ----loadSB, message=FALSE----------------------------------------------------
library(SummarizedBenchmark)
data("quantSB")

## ----metadata-----------------------------------------------------------------
colData(allSB[[1]])[, c("param.args", "meta.version", "sample", "tissue")]

## ----pcaPlot------------------------------------------------------------------
keep <- !rowSums( is.na( assays( allSB[[1]] )[["default"]] ) ) > 0

pcaRes <- 
    prcomp( log10( t( assays( allSB[[1]] )[["default"]][keep,] ) + 1 ) )
varExp <- round( 100*(pcaRes$sdev/sum( pcaRes$sdev )), 2)

tidyData <- data.frame( 
    PC1=pcaRes$x[,"PC1"], 
    PC2=pcaRes$x[,"PC2"], 
    sample=colData( allSB[[1]] )$sample, 
    label=gsub("\\d+$", "", rownames( colData(allSB[[1]]) ) ) )

tidyData <- tidyData %>%
    dplyr::mutate( 
               method=gsub( "-.*$", "", label) )


tidyData %>%
    ggplot( aes( PC1, PC2, colour=label ) ) +
    geom_point() + coord_fixed() +
    ylab(sprintf( "PC2 (%0.2f %%)", varExp[2]) ) +
    xlab(sprintf( "PC1 (%0.2f %%)", varExp[1]) ) +
    theme(legend.pos="top") +
    guides(col = guide_legend(nrow = 5), 
           shape = guide_legend(nrow = 4))

## ----rnaseqcomp, message=FALSE, eval=FALSE------------------------------------
#  library(rnaseqcomp)
#  library(biomaRt)
#  data(simdata)
#  houseHuman <- simdata$meta$gene[simdata$meta$house]
#  houseHuman <- gsub("\\.\\d+", "", houseHuman )
#  
#  mart <- useMart( "ensembl", "mmusculus_gene_ensembl" )
#  geneMap <- getBM( c( "ensembl_transcript_id",
#                      "hsapiens_homolog_ensembl_gene",
#                      "hsapiens_homolog_orthology_type" ),
#                   mart = mart)
#  geneMap <-
#      geneMap[geneMap$`hsapiens_homolog_orthology_type` == "ortholog_one2one",]
#  houseMouse <-
#      geneMap$ensembl_transcript_id[geneMap$hsapiens_homolog_ensembl_gene %in%
#                                    houseHuman]
#  
#  allSB <- do.call( cbind, allSB )
#  colData( allSB )$label <- gsub("\\d+$", "", rownames( colData( allSB ) ) )
#  
#  condInfo <-
#      colData( allSB )[colData( allSB )$label == "kallisto-default","tissue"]
#  
#  replicateInfo <- condInfo
#  evaluationFeature <- rep( TRUE, length.out = nrow( allSB ) )
#  
#  calibrationFeature <- rownames(allSB) %in% houseMouse
#  
#  unitReference <- 1
#  quantificationList <- lapply(
#      split( seq_along( colnames( allSB ) ), colData( allSB )$label ),
#      function(x) { assay( allSB )[,x] })
#  
#  compObject <- signalCalibrate(
#      quantificationList,
#      factor(condInfo), factor(replicateInfo),
#      evaluationFeature, calibrationFeature, unitReference,
#      calibrationFeature2 = calibrationFeature)
#  
#  compObject

Try the SummarizedBenchmark package in your browser

Any scripts or data that you put into this service are public.

SummarizedBenchmark documentation built on Nov. 8, 2020, 8:30 p.m.