In wevanjohnson/animalcules.preprocess: This package contains tools and functions for preprocessing microbiome data

knitr::opts_chunk$set(echo = TRUE)

library(shiny)
library(shinythemes)
library(Biostrings)
library(tools)
library(Rsubread)
library(Rsamtools)
library(taxize)
library(rsconnect)

#' Helper function for demultiplexing
#'
#' Helper function for demultiplexing sequencing reads, designed in a way to allow for parallelization accross barcodes (parallel extraction of reads by barcode). This function takes a specific barcode (numeric index) from lists of sample names/barcodes, a Biostrings::DNAStringSet of barcodes by sequence header, and a Biostrings::QualityScaledXStringSet of reads corresponding to the barcodes. Based on the barcode index given, it extracts all reads for the indexed barcode and writes all the reads from that barcode to a separate .fastq file. 
#' @param barcodeIndex Which barcode (integer number or index) in the barcodes or sample name to use for read extraction 
#' @param barcodes A list of all barcodes in the sequencing dataset. Correlates and in same order as sampleNames
#' @param sampleNames A list of sample names or identifiers associated with each barcode in the barcodes list
#' @param index A Biostrings::DNAStringSet that contains the read headers and barcode sequence for each header in the sequence slot
#' @param reads A Biostrings::QualityScaledXStringSet that has the same headers and order as the index file, but contains the read sequences and their quality scores
#' @param location A directory location to store the demuliplexed read files. Defaults to generate a new subdirectory at './demultiplex_fastq'
#' @param rcBarcodes Should the barcode indexes in the barcodes list be reverse complemented to match the sequences in the index DNAStringSet? Defaults to TRUE
#' @param hDist Uses a Hamming Distance or number of base differences to allow for inexact matches for the barcodes/indexes. Defaults to 0. Warning: if the Hamming Distance is >=1 and this leads to inexact index matches to more than one barcode, that read will be written to more than one demultiplexed read files 
#' 
#' @return Writes a single .fastq file that contains all reads whose index matches the barcode specified. This file will be written to the location directory, and will be named based on the specified sampleName and barcode, e.g. './demultiplex_fastq/SampleName1_GGAATTATCGGT.fastq.gz' 
#'
#' @examples
#' ## Load example barcode, index, and read data into R session:
#' barcodePath <- system.file("extdata", "barcodes.txt", package = "animalcules.preprocess")
#' bcFile <- read.table(barcodePath, sep = "\t", header = T)
#' 
#' indexPath <- system.file("extdata", "virus_example_index.fastq", package = "animalcules.preprocess")
#' inds <- Biostrings::readDNAStringSet(indexPath, format = "fastq")
#' 
#' readPath <- system.file("extdata", "virus_example.fastq", package = "animalcules.preprocess")
#' reads <- Biostrings::readQualityScaledDNAStringSet(readPath)
#' 
#' ## Extract reads from the first barcode
#' results <- extractReads(1, bcFile[, 2], bcFile[, 1], inds, reads, rcBarcodes = FALSE, 
#'     location = ".")
#' results
#' 
#' ## Extract reads from multiple barcodes
#' more_results <- lapply(1:6, extractReads, bcFile[, 2], bcFile[, 1], inds, 
#'     reads, rcBarcodes = FALSE, location = ".")
#'                        
#' ## BiocParallel application
#' multicoreParam <- BiocParallel::MulticoreParam(workers = 3)
#' parallel_results <- BiocParallel::bplapply(1:6, extractReads, bcFile[, 
#'     2], bcFile[, 1], inds, reads, rcBarcodes = FALSE, location = ".", BPPARAM = multicoreParam)
#' @export
#' 
extractReads <- function(barcodeIndex, barcodes, sampleNames, index, reads, 
                         location = "./demultiplex_fastq", rcBarcodes = TRUE, hDist = 0) {
  barcode <- barcodes[barcodeIndex]
  sampleName <- sampleNames[barcodeIndex]
  message("Finding reads for barcode: ", barcode)
  if (rcBarcodes) {
    rci <- as.character(Biostrings::reverseComplement(Biostrings::DNAString(barcode)))
  } else {
    rci <- barcode
  }
  # ind_match <- as.character(index) == rci
  ind_match <- adist(as.character(index), rci) <= hDist

  numReads <- sum(ind_match)
  outFileName <- paste(location, "/", sampleName, "_", barcode, ".fastq.gz", 
                       sep = "")
  if (numReads == 0) {
    message("\tFound 0 reads for this barcode, no file will be written")
  } else {
    message("\tFound ", sum(ind_match), " reads, writing reads to: ", 
            outFileName)
    Biostrings::writeQualityScaledXStringSet(reads[c(ind_match)], outFileName, 
                                             compress = T)
  }
  return(list(output_file = outFileName, numberOfReads = numReads, matchedIndexes = ind_match))
}


#' Demultiplexing sequencing reads
#'
#' Function for demultiplexing sequencing reads arranged in a common format provided by sequencers (such as Illumina) generally for 16S data. This function takes a matrix of sample names/barcodes, a .fastq file of barcodes by sequence header, and a .fastq file of reads corresponding to the barcodes. Based on the barcodes given, the function extracts all reads for the indexed barcode and writes all the reads from that barcode to separate .fastq files. 
#' @param barcodeFile File name for a file containing a .tsv matrix with a header row, and then sample names (column 1) and barcodes (column 2).
#' @param indexFile Location to a .fastq file that contains the barcodes for each read. The headers should be the same (and in the same order) as the readFile, and the sequence in the indexFile should be the corresponding barcode for each read. Quality scores are not considered
#' @param readFile Location to the sequencing read .fastq file that corresponds to the indexFile
#' @param rcBarcodes Should the barcode indexes in the barcodeFile be reverse complemented to match the sequences in the indexFile? Defaults to TRUE
#' @param location A directory location to store the demuliplexed read files. Defaults to generate a new subdirectory at './demultiplex_fastq'
#' @param cores The number of cores to use for parallelization (BiocParallel). This function will parallelize over the barcodes and extract reads for each barcode separately and write them to separate demultiplexed files
#' @param hammingDist Uses a Hamming Distance or number of base differences to allow for inexact matches for the barcodes/indexes. Defaults to 0. Warning: if the Hamming Distance is >=1 and this leads to inexact index matches to more than one barcode, that read will be written to more than one demultiplexed read files 
#' 
#' @return Returns multiple .fastq files that contain all reads whose index matches the barcodes given. These files will be written to the location directory, and will be named based on the given sampleNames and barcodes, e.g. './demultiplex_fastq/SampleName1_GGAATTATCGGT.fastq.gz' 
#'
#' @examples
#' ## Get barcode, index, and read data locations
#' barcodePath <- system.file("extdata", "barcodes.txt", package = "animalcules.preprocess")
#' indexPath <- system.file("extdata", "virus_example_index.fastq", package = "animalcules.preprocess")
#' readPath <- system.file("extdata", "virus_example.fastq", package = "animalcules.preprocess")
#' 
#' ## Get barcode, index, and read data locations
#' demult <- demultiplex(barcodePath, indexPath, readPath, rcBarcodes = FALSE, 
#'     hammingDist = 2)
#' demult
#'
#' @export

demultiplex <- function(barcodeFile, indexFile, readFile, rcBarcodes = TRUE, 
                        location = "./demultiplex_fastq", cores = 1, hammingDist = 0) {
  message("Reading Sample Names and Barcodes from: ", barcodeFile)
  bcFile <- read.table(barcodeFile, sep = "\t", header = T)
  barcodes <- bcFile[, 2]
  samNames <- bcFile[, 1]
  message("\tFound information for ", length(barcodes), " samples/barcodes")

  message("Reading Index File: ", indexFile)
  inds <- Biostrings::readDNAStringSet(indexFile, format = "fastq")
  message("\tFound indexes for ", length(inds), " reads")

  message("Reading Sequence File: ", readFile)
  reads <- Biostrings::readQualityScaledDNAStringSet(readFile)
  message("\tFound ", length(reads), " reads")

  ## make output directory if nessary
  if (!dir.exists(location)) {
    dir.create(location)
  }

  # Loop over barcodes
  numReads <- NULL
  ind_no_match <- numeric(length(reads))
  for (i in 1:length(barcodes)) {
    extracted <- extractReads(i, barcodes, samNames, inds, reads, rcBarcodes = rcBarcodes, 
                              location = location, hDist = hammingDist)
    numReads <- c(numReads, extracted$numberOfReads)
    ind_no_match <- ind_no_match + extracted$matchedIndexes
  }
  message(sum(ind_no_match > 1))
  ind_no_match <- (ind_no_match == 0)

  # sapply over barcodes and writing to file -- for multi-threading if
  # (cores == 1){ extracted <- lapply(1:length(barcodes), extractReads,
  # barcodes, samNames, inds, reads, hammingDist) }else{ message('Using
  # ',cores,' cores') multicoreParam <- MulticoreParam(workers = cores)
  # extracted <- bplapply(1:length(barcodes),extractReads, barcodes,
  # samNames, inds, reads, BPPARAM = multicoreParam) } numReads <-
  # sapply(extracted, function(x) x$numberOfReads) ind_no_match <- (
  # rowSums(sapply(extracted, function(x) x$matchedIndexes)) == 0 )

  # number of reads for each barcode
  if (any(numReads == 0)) {
    message("Did not find any reads for the following barcodes: ", 
            paste(barcodes[numReads == 0], collapse = " "))
    message("Did not find any reads for the following samples: ", paste(samNames[numReads == 
                                                                                   0], collapse = " "))
    write(paste("Did not find any reads for the following barcodes:", 
                paste(barcodes[numReads == 0], collapse = " "), "\n", "Did not find any reads for the following samples: ", 
                paste(samNames[numReads == 0], collapse = " ")), file = "demultiplex_fastq/unmapped_barcodes_samples.txt")
  }

  # Track reads without matches, and write them to an 'orphan' file
  message(paste("Found ", sum(ind_no_match), " reads without a matching barcode (", 
                100 * round(mean(ind_no_match), 4), "%), writing reads to: ", location, 
                "/orpahns.fastq.gz", sep = ""))
  Biostrings::writeQualityScaledXStringSet(reads[c(ind_no_match)], paste(location, 
                                                                         "/orpahns.fastq.gz", sep = ""), compress = T)

  summaryMat <- cbind(bcFile[1:length(barcodes), ], NumberOfReads = numReads)
  write.table(summaryMat, file = paste(location, "/summary.txt", sep = ""), 
              col.names = F, row.names = T, quote = F)
  return(summaryMat)
}



ui <-  navbarPage(
  title = paste("Animalcules Preprocess", sep = ""),
  id="Animalcules Preprocess",
  fluid=TRUE,
  theme = "bootstrap.min.css",
  tabPanel(title = "Demultiplexing Read",
  sidebarLayout(
    sidebarPanel(
#Read in Barcode File 

         fileInput("barcode", "Barcode File",
                accept = c(
                  ".txt")),
         checkboxInput("headerbarcode", "Header", TRUE),
         sliderInput("displaybarcode","View Rows:",min=0,max=0,value=10,step=1),

#Read in Read and Index File          

 fileInput("index", "Index File",
                accept = c(
                  ".fast1.gz")),        
 fileInput("read", "Read File",
                accept = c(
                   ".fast1.gz")),
         sliderInput("display","View Rows:",min=0,max=0,value=10,step=1),
      actionButton("upload","Upload"),

#Hamming, rcBarcode, and Codes

      selectInput("hammingDist","Hamming Distance",choices=1,selected=1),
      checkboxInput("rcBarcodes", "Reverse Compliment the Barcodes", FALSE),

#Specify Output File

       textInput("location", "Output File Location",
                  placeholder = "New Folder"
      ),

#Dimultiplex!

      actionButton("demultiplex","Demultiplex")
    ),

#Index Table Display 

    mainPanel(
      dataTableOutput("tablebarcode"), tableOutput("table")
    )
   )
  )
,
tabPanel(title = "Library Generation",
                            mainPanel(# the following lines could be uncommented when the download ref seq can
                                      # work on the rest of the kingdoms
                                      # radioButtons("kingdom", "Choose a kingdom:",
                                      #              c("Archaea" = "archaea",
                                      #                "Bacteria" = "bacteria",
                                      #                "Fungi" = "fungi",
                                      #                "Invertebrate" = "invertebrate",
                                      #                "Plant" = "plant",
                                      #                "Protozoa" = "protozoa",
                                      #                "Vertebrate" = "vertibrate",
                                      #                "Vertebrate other" = "vertibrate_other",
                                      #                "Virus" = "viral")
                                      #              ),
                                      radioButtons("kingdom", "Choose a kingdom:",
                                                   c("Bacteria" = "bacteria",
                                                     "Virus" = "viral")
                                      ),

                                      # create checkbox input for representative library and reference library
                                      checkboxInput("representative", "representative", value = TRUE, width = NULL),
                                      checkboxInput("reference", "reference", value = FALSE, width = NULL),


                                      actionButton("downloadref","Download Ref_Seq")
                                      )

                   ),

                   tabPanel(title = "Alignment",
                            sidebarLayout(
                              sidebarPanel(

                              ),
                              mainPanel(
                              )
                            )
                   )
  )

server <- function(input, output, session) {

#Barcode File Display  

  #Inputting data
  reactbarcode <- reactive ({
    req(input$barcode)
    barcodeFile <- read.table(input$barcode$datapath, header = input$headerbarcode, sep = "\t")
    barcodeFile 
  })
  #Updates slider
  observe({ 
    updateSliderInput (session,"displaybarcode",min=1,value=10,max=nrow(reactbarcode()))
  })
  #Display Index Data
  updatedisplaybarcode<- reactive({
    displayb<-reactbarcode()[1:input$displaybarcode,]
    return(displayb)
  })
  #Output Data
  output$tablebarcode <- renderDataTable({
    updatedisplaybarcode()[1:input$displaybarcode,]
  })

    #Update Hamming Distance Maximum
    observe({
    updateSelectInput(session,"hammingDist",choices=c(1:nchar(as.character(reactbarcode()[1,2]))),selected = 1)
  })

#Index and Read File Display  

  #Inputting Index data
  reactindex <- reactive ({
    req(input$index)
    indexFile <- Biostrings::readDNAStringSet(input$index$datapath, format = "fastq", n=100)
    indexFile
  })

  #Inputting Read data
  reactread <- reactive ({
    req(input$read)
    readFile<- Biostrings::readQualityScaledDNAStringSet(input$read$datapath, n=100)
    readFile
  })

  #Updates slider
  observe({ 
    updateSliderInput (session,"display",min=1,value=10,max=100)
  })
  observeEvent(input$upload, {

   #Display Data
  updatedisplay<- reactive({
    Index<-reactindex()[1:input$display,]
    Read<-reactread()[1:input$display,]
    x<-cbind.data.frame(Index,Read)
    return(x)
  })
  #Output Data
  output$table <- renderTable({
    updatedisplay()[1:input$display,]
  })})


#rcBarcodes

  rc<-reactive({
    return(as.logical(input$rcBarcodes))
 })  

#hammingDist

  ham<-reactive({
    return(as.numeric(input$hammingDist))
  })


 out<-reactive({
    return(as.character(input$location))
 })  

#Demultiplex
observeEvent(input$demultiplex, {

  demultiplex(barcodeFile=input$barcode$datapath ,indexFile=input$index$datapath ,readFile = input$read$datapath,location=file.path(out(),"/Demultiplexed Sample Files"), rcBarcodes = rc(), cores= cores(), hammingDist= ham()) 

  })

observeEvent(input$downloadref, animacules_download_refseq(input$kingdom, input$representative, input$reference))

}  
shinyApp(ui, server)

wevanjohnson/animalcules.preprocess documentation built on May 11, 2019, 8:26 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

wevanjohnson/animalcules.preprocess
This package contains tools and functions for preprocessing microbiome data

In wevanjohnson/animalcules.preprocess: This package contains tools and functions for preprocessing microbiome data

R Package Documentation

Browse R Packages

We want your feedback!

wevanjohnson/animalcules.preprocess This package contains tools and functions for preprocessing microbiome data

In wevanjohnson/animalcules.preprocess: This package contains tools and functions for preprocessing microbiome data

R Package Documentation

Browse R Packages

We want your feedback!

wevanjohnson/animalcules.preprocess
This package contains tools and functions for preprocessing microbiome data